From 8f75a3c36cd6dc30c256a14c87187cd5244483b1 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 17 Jan 2024 02:37:56 +0000
Subject: [PATCH 01/11] backup

backup

bug fix for Arc_XMX

bugfix for dg2

modify UT of Dg2 fp16

revert debugging changes
---
 CMakeLists.txt                                |   1 -
 _clang-format                                 | 122 ++++++++++++++++++
 examples/01_gemm_universal/gemm_universal.cpp |  14 +-
 examples/02_basic_gemm/basic_gemm.cpp         |   5 +-
 .../scaled_dot_product_attention.cpp          |  18 +--
 .../softmax.hpp                               |   4 +-
 include/common/common.hpp                     |   6 +
 include/common/utils/common.hpp               |   4 +-
 include/common/utils/raw_send_nbarrier.hpp    |   7 +-
 include/group/gemm/compute_policy.hpp         |  14 +-
 include/group/gemm/impl/default_xmx_xe.hpp    |   2 +-
 include/kernel/gemm/default_gemm.hpp          |  30 ++---
 include/kernel/gemm/impl/default_xe.hpp       |   2 +-
 include/kernel/gemm/impl/kslicing_xe.hpp      |   2 +-
 include/subgroup/tile/impl/payload_xe.hpp     |   9 +-
 include/subgroup/tile/impl/prefetch_xe.hpp    |   2 +-
 .../subgroup/tile/impl/tile_op_functor.hpp    |  14 +-
 tests/integration/gemm/fp16/common.hpp        |  20 +++
 tests/integration/gemm/fp16/kernel_func.hpp   |  12 +-
 tests/integration/gemm/fp16/main.cpp          |   4 +-
 tests/unit/tile_load_store/main.cpp           |   9 +-
 21 files changed, 228 insertions(+), 73 deletions(-)
 create mode 100644 _clang-format

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ebfee8f09..2b46837d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,6 @@ else() # Windows
 endif() 
 
 project(XeTLA)
-
 include(CTest)
 enable_testing()
 
diff --git a/_clang-format b/_clang-format
new file mode 100644
index 000000000..eee0a4ee7
--- /dev/null
+++ b/_clang-format
@@ -0,0 +1,122 @@
+#===============================================================================
+# Copyright 2016-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+---
+Language:        Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: DontAlign
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: DontAlign
+AlignOperands:   false
+AlignTrailingComments: false
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: All
+BreakBeforeBraces: Custom
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 8
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+FixNamespaceComments: true
+ForEachMacros:
+IncludeBlocks:   Preserve
+IncludeCategories:
+  - Regex: '<[[:alnum:].]+>'
+    Priority: 0
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: true
+# IndentPPDirectives: AfterHash
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+ReflowComments:  false
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Cpp11
+StatementMacros:
+  - for_
+  - PRAGMA_OMP
+  - PRAGMA_OMP_SIMD
+TabWidth:        4
+UseTab:          Never
+...
+# vim:ft=conf et ts=2 sw=2
diff --git a/examples/01_gemm_universal/gemm_universal.cpp b/examples/01_gemm_universal/gemm_universal.cpp
index 55d94249d..ed566cea3 100644
--- a/examples/01_gemm_universal/gemm_universal.cpp
+++ b/examples/01_gemm_universal/gemm_universal.cpp
@@ -33,9 +33,9 @@ void gemm_universal_run(uint32_t iter) {
     size_t size_b = matrix_k * matrix_n;
     size_t size_c = matrix_m * matrix_n;
 
-    using data_type_a = bf16;
-    using data_type_b = bf16;
-    using data_type_c = bf16;
+    using data_type_a = fp16;
+    using data_type_b = fp16;
+    using data_type_c = fp16;
     using data_type_acc = float;
 
     //Turn on the profiling property to facilitate subsequent profiling
@@ -91,7 +91,11 @@ void gemm_universal_run(uint32_t iter) {
                     tune_key_value::dispatch_policy_kslicing>,
             elem_v_t<tune_key::global_kslicing_ratio, num_global_splitk>,
             elem_v_t<tune_key::local_kslicing_ratio, num_local_splitk>,
-            elem_t_t<tune_key::wg_tile_shape, shape<wg_tile_n, wg_tile_m>>>;
+            elem_t_t<tune_key::wg_tile_shape, shape<wg_tile_n, wg_tile_m>>,
+            elem_t_t<tune_key::group_swizzle_policy,
+                    gpu::xetla::kernel::group_swizzle_default<gpu_arch::Dg2>>,
+            elem_t_t<tune_key::epilogue_policy,
+                    gpu::xetla::group::epilogue_policy_default<gpu_arch::Dg2>>>;
     using gemm_op_t = gpu::xetla::kernel::default_gemm_t<
             data_type_a, // input datatype for A
             mem_layout::row_major, // memory layout for A
@@ -103,7 +107,7 @@ void gemm_universal_run(uint32_t iter) {
             mem_layout::row_major, // memory layout for C
             8, // leading dimension alignment for C, in unit of element
             data_type_acc, // accumulator data type for intermediate resutls
-            gpu_arch::Xe, // GPU arch
+            gpu_arch::Dg2, // GPU arch
             tune_option>;
 
     // allocate temp buffers for global split
diff --git a/examples/02_basic_gemm/basic_gemm.cpp b/examples/02_basic_gemm/basic_gemm.cpp
index 918a78e10..330e85e09 100644
--- a/examples/02_basic_gemm/basic_gemm.cpp
+++ b/examples/02_basic_gemm/basic_gemm.cpp
@@ -135,7 +135,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                         data_type_acc, // accumulator data type for intermediate resutls
                         wg_shape, // computation tile shape
                         k_stride, // elements in each iteration
-                        gpu_arch::Xe, // GPU arch
+                        arch_tag_, // GPU arch
                         gemm_tune_option>;
                 gemm_t gemm;
 
@@ -149,7 +149,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                         mem_space::global, // memory writing to global mem for C
                         wg_shape, // computation tile shape
                         k_stride, // elements in each iteration
-                        gpu_arch::Xe, // GPU arch
+                        arch_tag_, // GPU arch
                         epilogue_tune_option>;
 
                 // Step 3: define the shared local memory usages
@@ -194,7 +194,6 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                 // the results is in the matAcc rather than real output C
                 typename gemm_t::work_group_t g(item.get_local_linear_id());
                 gemm(g, matAcc, gemm_args);
-
                 // Step 7: write the results from matACC to real output C
                 epilogue_t epilogue;
                 epilogue(g, matAcc, md_c);
diff --git a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
index cb61058f6..0060c299e 100644
--- a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
+++ b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
@@ -254,14 +254,14 @@ void sdp_fwd_run(uint32_t iter) {
                     using wg_shape0 = shape<wg_tile_n_qk, wg_tile_m_qk>;
                     using sg_shape0 = shape<sg_tile_n_qk, sg_tile_m_qk>;
 
-                    using post_op0_t = scalar_mul_op_t<float, gpu_arch::Xe>;
+                    using post_op0_t = scalar_mul_op_t<float, gpu_arch::Dg2>;
                     using post_op1_t = elemwise_reduce_op_t<reduce_op::sum,
-                            dtype_in, gpu_arch::Xe>;
+                            dtype_in, gpu_arch::Dg2>;
                     using post_op_t = chained_tile_op_t<post_op0_t, post_op1_t>;
                     using epilogue_policy0
                             = xetla::group::epilogue_policy_tile_op<post_op_t,
-                                    gpu_arch::Xe>;
-                    using group_swizzle = group_swizzle_default<gpu_arch::Xe>;
+                                    gpu_arch::Dg2>;
+                    using group_swizzle = group_swizzle_default<gpu_arch::Dg2>;
 
                     using tune_option0 = dict_t<
                             elem_v_t<tune_key::param_optimizer_type,
@@ -288,7 +288,7 @@ void sdp_fwd_run(uint32_t iter) {
                             float, // accumulator data type for intermediate resutls
                             wg_shape0, // computation tile shape
                             wg_tile_k_qk, // elements in each iteration
-                            gpu_arch::Xe, // GPU arch
+                            gpu_arch::Dg2, // GPU arch
                             tune_option0>;
                     using epilogue0_t = xetla::group::default_epilogue_selector_t<
                             dtype_sfx, // onput datatype for C
@@ -298,7 +298,7 @@ void sdp_fwd_run(uint32_t iter) {
                                     local, // memory writing to local mem for C
                             wg_shape0, // computation tile shape
                             wg_tile_k_qk, // elements in each iteration
-                            gpu_arch::Xe, // GPU arch
+                            gpu_arch::Dg2, // GPU arch
                             tune_option0>;
                     using gemm_op0_t = gemm_universal_t<
                             dispatch_policy_default<group_swizzle>, gemm0_t,
@@ -315,7 +315,7 @@ void sdp_fwd_run(uint32_t iter) {
                     // we only need to do thread sync while store gemm results to SLM
                     // one barrier is enough for that
                     xetla_nbarrier_init<1>();
-                    xetla_nbarrier_t<thread_num, thread_num, gpu_arch::Xe>
+                    xetla_nbarrier_t<thread_num, thread_num, gpu_arch::Dg2>
                             nbarrier;
                     nbarrier.init_nbarrier(0, nbarrier_role::producer_consumer);
 
@@ -386,7 +386,7 @@ void sdp_fwd_run(uint32_t iter) {
                             float, // accumulator data type for intermediate resutls
                             wg_shape1, // computation tile shape
                             wg_tile_k_sv, // elements in each iteration
-                            gpu_arch::Xe, // GPU arch
+                            gpu_arch::Dg2, // GPU arch
                             tune_option1>;
 
                     // gemm arguments include matA & matB load information and
@@ -465,7 +465,7 @@ void sdp_fwd_run(uint32_t iter) {
                                 0);
                         xetla_tstore_global<dtype_out, sg_tile_n_sv,
                                 cache_hint::write_back, cache_hint::write_back,
-                                gpu_arch::Xe>(transpose_tdecs, out_reg);
+                                gpu_arch::Dg2>(transpose_tdecs, out_reg);
                     }
                 });
             });
diff --git a/examples/08_scaled_dot_product_attention/softmax.hpp b/examples/08_scaled_dot_product_attention/softmax.hpp
index 184bc311b..58fb1c688 100644
--- a/examples/08_scaled_dot_product_attention/softmax.hpp
+++ b/examples/08_scaled_dot_product_attention/softmax.hpp
@@ -57,7 +57,7 @@ struct xetla_softmax_fwd_t {
             mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>,
             softmax_tile_desc_t,
             subgroup::msg_type_v<softmax_tile_desc_t, mem_space_in>,
-            gpu_arch::Xe>;
+            gpu_arch::Dg2>;
 
     // this tile will store the softmax result to global memory
     using softmax_store_t = subgroup::tile_t<dtype_out, softmax_tile_desc_t>;
@@ -65,7 +65,7 @@ struct xetla_softmax_fwd_t {
             mem_desc_t<dtype_out, mem_layout::row_major, mem_space_out>,
             softmax_tile_desc_t,
             subgroup::msg_type_v<softmax_tile_desc_t, mem_space_out>,
-            gpu_arch::Xe>;
+            gpu_arch::Dg2>;
 
     struct arguments_t {
         // available while original data is from SLM
diff --git a/include/common/common.hpp b/include/common/common.hpp
index 97c831c4b..cccc09bbb 100644
--- a/include/common/common.hpp
+++ b/include/common/common.hpp
@@ -21,3 +21,9 @@
 
 #include <common/core/core.hpp>
 #include <common/utils/utils.hpp>
+
+#ifdef __SYCL_DEVICE_ONLY__ // defined only during the SYCL device compilation pass
+#define CONSTANT __attribute__((opencl_constant)) // Clang OpenCL constant address-space attribute
+#else
+#define CONSTANT // expands to nothing when compiling host code
+#endif // __SYCL_DEVICE_ONLY__
diff --git a/include/common/utils/common.hpp b/include/common/utils/common.hpp
index e67ccc829..52a4a61e3 100644
--- a/include/common/utils/common.hpp
+++ b/include/common/utils/common.hpp
@@ -46,7 +46,7 @@ constexpr uint32_t get_element_size_code() {
 enum class lsc_action : uint8_t { prefetch, load, store, atomic };
 
 template <lsc_action Action, cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
-constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, void>
+constexpr std::enable_if_t<arch_tag <= gpu_arch::Xe, void>
 check_lsc_cache_hint() {
     if constexpr (Action == lsc_action::prefetch) {
         // https://gfxspecs.intel.com/Predator/Home/Index/53560
@@ -126,7 +126,7 @@ get_prefetch_cache_hint_code() {
 }
 
 template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
-constexpr std::enable_if_t<arch_tag == gpu_arch::Xe, uint32_t>
+constexpr std::enable_if_t<arch_tag <= gpu_arch::Xe, uint32_t>
 get_store_cache_hint_code() {
     check_lsc_cache_hint<lsc_action::store, L1H, L2H, arch_tag>();
     if (L1H == cache_hint::none && L2H == cache_hint::none) {
diff --git a/include/common/utils/raw_send_nbarrier.hpp b/include/common/utils/raw_send_nbarrier.hpp
index fb7b92ee1..5050b9c51 100644
--- a/include/common/utils/raw_send_nbarrier.hpp
+++ b/include/common/utils/raw_send_nbarrier.hpp
@@ -107,14 +107,15 @@ struct xetla_nbarrier_t<num_producers, num_consumers, gpu_arch::Dg2> {
     /// @brief Generic work-group split barrier.
     ///
     __XETLA_API void arrive() {
-        __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::signal>();
+        // __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::signal>();
+        __ESIMD_NS::barrier(); // NOTE(review): same call as wait(); an arrive()/wait() pair now issues two barriers - confirm intended
     }
 
     /// @brief named barrier wait within subgroup.
     ///
     __XETLA_API void wait() {
-        __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::wait>();
-        // __ESIMD_NS::barrier();
+        // __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::wait>();
+        __ESIMD_NS::barrier(); // NOTE(review): same call as arrive(); an arrive()/wait() pair now issues two barriers - confirm intended
     }
 
     /// @brief named barrier signal from subgroup.
diff --git a/include/group/gemm/compute_policy.hpp b/include/group/gemm/compute_policy.hpp
index 43d6b6bef..bceb3f339 100644
--- a/include/group/gemm/compute_policy.hpp
+++ b/include/group/gemm/compute_policy.hpp
@@ -31,19 +31,20 @@ namespace gpu::xetla::group {
 /// @tparam perf_tuning_knob_ Is performance-related knobs.
 /// @tparam arch_tag_ Is the HW architecture.
 template <typename compute_attr_, typename perf_tuning_knob_,
-        gpu_arch arch_tag_>
+        gpu_arch arch_tag_ = gpu_arch::Xe, typename enable = void>
 struct compute_policy_default_xmx {};
 
 /// @brief Specialized for Xe architecture.
-template <typename compute_attr_, typename perf_tuning_knob_>
-struct compute_policy_default_xmx<compute_attr_, perf_tuning_knob_,
-        gpu_arch::Xe> {
+template <typename compute_attr_, typename perf_tuning_knob_,
+        gpu_arch arch_tag_>
+struct compute_policy_default_xmx<compute_attr_, perf_tuning_knob_, arch_tag_,
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Xe)>> {
     using compute_attr = compute_attr_;
     using perf_tuning_knob = perf_tuning_knob_;
     static constexpr int k_stride = perf_tuning_knob::k_stride;
     static constexpr int stages = perf_tuning_knob::stages;
     static constexpr int sync_freq = perf_tuning_knob::sync_freq;
-    static constexpr gpu_arch arch_tag = gpu_arch::Xe;
+    static constexpr gpu_arch arch_tag = arch_tag_;
     using dtype_mma_acc = typename compute_attr::dtype_acc;
     using dtype_mma_a = typename compute_attr::dtype_a;
     using dtype_mma_b = typename compute_attr::dtype_b;
@@ -53,7 +54,8 @@ struct compute_policy_default_xmx<compute_attr_, perf_tuning_knob_,
             = block_bytes_x_a / sizeof(dtype_mma_a);
     static constexpr uint32_t block_size_y_a = 16;
 
-    static constexpr uint32_t block_size_x_b = 16;
+    static constexpr uint32_t block_size_x_b = arch_tag < gpu_arch::Xe ? 8 : 16;
+
     static constexpr uint32_t block_bytes_y_b = 32;
     static constexpr uint32_t block_size_y_b
             = block_bytes_y_b / sizeof(dtype_mma_b);
diff --git a/include/group/gemm/impl/default_xmx_xe.hpp b/include/group/gemm/impl/default_xmx_xe.hpp
index 08d2d216f..c0d51e236 100644
--- a/include/group/gemm/impl/default_xmx_xe.hpp
+++ b/include/group/gemm/impl/default_xmx_xe.hpp
@@ -37,7 +37,7 @@ class gemm_t<
         mem_desc_a_t_, // memory attribute of matA
         mem_desc_b_t_, // memory attribute of matB
         pre_processing_t_, // pre_processing functor
-        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Xe)>> {
 public:
     using mem_desc_a_t = mem_desc_a_t_;
     using mem_desc_b_t = mem_desc_b_t_;
diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index 455eeeaf0..0b0584062 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -1,18 +1,18 @@
 /*******************************************************************************
-* Copyright (c) 2022-2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
+ * Copyright (c) 2022-2023 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
 
 /// @file
 /// C++ API
@@ -304,4 +304,4 @@ struct param_adaptor<param_adaptor_tag::work_group_epilogue, dict_t_> {
 
     using type = epilogue_t;
 };
-} // namespace gpu::xetla
\ No newline at end of file
+} // namespace gpu::xetla
diff --git a/include/kernel/gemm/impl/default_xe.hpp b/include/kernel/gemm/impl/default_xe.hpp
index 8f2da1cc3..93c949e09 100644
--- a/include/kernel/gemm/impl/default_xe.hpp
+++ b/include/kernel/gemm/impl/default_xe.hpp
@@ -35,7 +35,7 @@ namespace gpu::xetla::kernel {
 template <typename gemm_t_, typename epilogue_t_, typename group_swizzle_>
 class gemm_universal_t<dispatch_policy_default<group_swizzle_>, gemm_t_,
         epilogue_t_,
-        std::enable_if_t<(group_swizzle_::arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(group_swizzle_::arch_tag <= gpu_arch::Xe)>> {
     using gemm_t = gemm_t_;
     using epilogue_t = epilogue_t_;
     using gemm_args_t = typename gemm_t::arguments_t;
diff --git a/include/kernel/gemm/impl/kslicing_xe.hpp b/include/kernel/gemm/impl/kslicing_xe.hpp
index 595b072ea..87dd795f8 100644
--- a/include/kernel/gemm/impl/kslicing_xe.hpp
+++ b/include/kernel/gemm/impl/kslicing_xe.hpp
@@ -39,7 +39,7 @@ template <int num_global_kslicing_, int num_local_kslicing_, typename gemm_t_,
 class gemm_universal_t<dispatch_policy_kslicing<group_swizzle_,
                                num_global_kslicing_, num_local_kslicing_>,
         gemm_t_, epilogue_t_,
-        std::enable_if_t<(group_swizzle_::arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(group_swizzle_::arch_tag <= gpu_arch::Xe)>> {
     using gemm_t = gemm_t_;
     using epilogue_t = epilogue_t_;
     using gemm_args_t = typename gemm_t::arguments_t;
diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp
index 7a518899b..60fc4ee85 100644
--- a/include/subgroup/tile/impl/payload_xe.hpp
+++ b/include/subgroup/tile/impl/payload_xe.hpp
@@ -590,7 +590,7 @@ template <typename dtype_, typename tile_desc_, mem_layout mem_layout_,
 struct mem_payload_t<
         mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>,
         tile_desc_, msg_type::unaligned_2d, arch_tag_,
-        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Xe)>> {
     using dtype = dtype_;
     using mem_desc_t
             = mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
@@ -1499,7 +1499,8 @@ struct prefetch_payload_t<
         tile_desc_t<tile_size_x_, tile_size_y_, block_size_x_, block_size_y_,
                 reg_layout_>,
         num_coop_sg_, arch_tag_,
-        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)
+                && (tile_size_y_ != 1 || block_size_y_ != 1)>> {
     using dtype = dtype_;
     using mem_desc_t
             = mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
@@ -1665,7 +1666,7 @@ struct prefetch_payload_t<
         mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>,
         tile_desc_t<tile_size_x_, 1, block_size_x_, 1, reg_layout_>,
         num_coop_sg_, arch_tag_,
-        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Xe)>> {
     using dtype = dtype_;
     using mem_desc_t
             = mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
@@ -1763,7 +1764,7 @@ template <typename dtype_, typename tile_desc_, mem_layout mem_layout_,
 struct prefetch_payload_t<
         mem_desc_t<dtype_, mem_layout_, mem_space::local, alignment_>,
         tile_desc_, num_coop_sg_, arch_tag_,
-        std::enable_if_t<(arch_tag_ == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Xe)>> {
     using dtype = dtype_;
     using mem_desc_t
             = mem_desc_t<dtype_, mem_layout_, mem_space::local, alignment_>;
diff --git a/include/subgroup/tile/impl/prefetch_xe.hpp b/include/subgroup/tile/impl/prefetch_xe.hpp
index 9205c68a3..a67400987 100644
--- a/include/subgroup/tile/impl/prefetch_xe.hpp
+++ b/include/subgroup/tile/impl/prefetch_xe.hpp
@@ -45,7 +45,7 @@ struct check_prefetch_type {
 
     static constexpr bool is_local_xe
             = ((payload_t::memory_space == mem_space::local)
-                    && (payload_t::arch_tag == gpu_arch::Xe));
+                    && (payload_t::arch_tag <= gpu_arch::Xe));
 };
 
 } // namespace detail
diff --git a/include/subgroup/tile/impl/tile_op_functor.hpp b/include/subgroup/tile/impl/tile_op_functor.hpp
index 379bd062d..866aabc8d 100644
--- a/include/subgroup/tile/impl/tile_op_functor.hpp
+++ b/include/subgroup/tile/impl/tile_op_functor.hpp
@@ -177,7 +177,7 @@ struct gelu_fwd_w_op_t {};
 /// @brief Is the element-wise gelu training forward op functor, specialized for Xe architecture.
 template <typename dtype_out_, gpu_arch arch_tag>
 struct gelu_fwd_w_op_t<dtype_out_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_out = dtype_out_;
     using mem_desc_w_t
             = mem_desc_t<dtype_out, mem_layout::row_major, mem_space::global>;
@@ -295,7 +295,7 @@ struct gelu_bwd_op_t {};
 /// @brief Is the element-wise gelu backward op functor, specialized for Xe architecture.
 template <typename dtype_in_, gpu_arch arch_tag>
 struct gelu_bwd_op_t<dtype_in_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_in = dtype_in_;
     using mem_desc_x_t
             = mem_desc_t<dtype_in, mem_layout::row_major, mem_space::global>;
@@ -490,7 +490,7 @@ struct scale_v_offset_v_op_t {};
 /// @brief Is the scale_v_offset_v op functor, specialized for Xe architecture.
 template <typename scale_dtype_, typename offset_dtype_, gpu_arch arch_tag>
 struct scale_v_offset_v_op_t<scale_dtype_, offset_dtype_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using scale_dtype = scale_dtype_;
     using offset_dtype = offset_dtype_;
 
@@ -619,7 +619,7 @@ struct scale_v_op_t {};
 /// @brief Is the scale_v op functor, specialized for Xe architecture.
 template <typename scale_dtype_, gpu_arch arch_tag>
 struct scale_v_op_t<scale_dtype_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using scale_dtype = scale_dtype_;
 
     using scale_mem_desc_t
@@ -933,7 +933,7 @@ struct dropout_op_t {};
 /// @brief Is the dropout op functor, specialized for Xe architecture.
 template <typename dtype_mask_, gpu_arch arch_tag>
 struct dropout_op_t<dtype_mask_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_mask = dtype_mask_;
     using mem_desc_mask_t
             = mem_desc_t<dtype_mask, mem_layout::row_major, mem_space::global>;
@@ -1010,7 +1010,7 @@ struct rng_dropout_op_t {};
 /// @brief Is the random number generator and dropout op functor, specialized for Xe architecture.
 template <typename dtype_mask_, gpu_arch arch_tag>
 struct rng_dropout_op_t<dtype_mask_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_mask = dtype_mask_;
     using mem_desc_mask_t
             = mem_desc_t<dtype_mask, mem_layout::row_major, mem_space::global>;
@@ -1114,7 +1114,7 @@ struct scalar_mul_op_t {};
 /// @brief Is the scalar_multiply op functor, specialized for Xe architecture.
 template <typename dtype_in_, gpu_arch arch_tag>
 struct scalar_mul_op_t<dtype_in_, arch_tag,
-        std::enable_if_t<(arch_tag == gpu_arch::Xe)>> {
+        std::enable_if_t<(arch_tag <= gpu_arch::Xe)>> {
     using dtype_in = dtype_in_;
     using mem_desc_in_t
             = mem_desc_t<dtype_in, mem_layout::row_major, mem_space::global>;
diff --git a/tests/integration/gemm/fp16/common.hpp b/tests/integration/gemm/fp16/common.hpp
index 01da3e8a8..8a6ceaf16 100644
--- a/tests/integration/gemm/fp16/common.hpp
+++ b/tests/integration/gemm/fp16/common.hpp
@@ -41,6 +41,26 @@ class TestBase {
     static constexpr mma_engine engine = mma_engine::xmx;
 };
 
+class Test : public TestBase { // 256x256x256 case: fp16 A/B/C, float accumulator
+public:
+    static constexpr size_t mat_m = 256;
+    static constexpr size_t mat_n = 256;
+    static constexpr size_t mat_k = 256;
+    static constexpr size_t wg_m = 8; // wg_*: presumably the work-group tile - verify
+    static constexpr size_t wg_n = 32;
+    static constexpr size_t sg_m = 8; // sg_*: presumably the subgroup tile - verify
+    static constexpr size_t sg_n = 16;
+    static constexpr size_t sg_k = 32;
+    static constexpr uint32_t global_kslicing = 1; // both k-slicing values are 1
+    static constexpr uint32_t local_kslicing = 1;
+    static constexpr mem_layout layout_a = mem_layout::row_major; // A and B are both row-major
+    static constexpr mem_layout layout_b = mem_layout::row_major;
+    using data_type_a = fp16;
+    using data_type_b = fp16;
+    using data_type_c = fp16;
+    using data_type_acc = float;
+}; // NOTE(review): verify this case is registered in the typed-test type list
+
 class Test0 : public TestBase {
 public:
     static constexpr size_t mat_m = 256;
diff --git a/tests/integration/gemm/fp16/kernel_func.hpp b/tests/integration/gemm/fp16/kernel_func.hpp
index 98fdb9572..adaef295d 100644
--- a/tests/integration/gemm/fp16/kernel_func.hpp
+++ b/tests/integration/gemm/fp16/kernel_func.hpp
@@ -29,8 +29,8 @@ template <typename dtype_a, typename dtype_b, typename dtype_c,
         uint32_t global_kslicing, uint32_t local_kslicing, mma_engine engine>
 struct fp16_gemm_test_func {
     using tile_shape = tile_shape_t<wg_n, wg_m, sg_n, sg_m>;
-    static constexpr uint32_t periodic_sync_interval = 8;
-    static constexpr uint32_t prefetch_distance = 3;
+    static constexpr uint32_t periodic_sync_interval = 0;
+    static constexpr uint32_t prefetch_distance = 0;
 
     using compute_attr = typename std::conditional<(engine == mma_engine::fpu),
             compute_attr_t<dtype_acc, dtype_acc, dtype_acc>,
@@ -40,9 +40,9 @@ struct fp16_gemm_test_func {
     using compute_policy =
             typename std::conditional<(engine == mma_engine::fpu),
                     compute_policy_default_fpu<compute_attr, perf_tuning_knob,
-                            gpu_arch::Xe>,
+                            gpu_arch::Dg2>,
                     compute_policy_default_xmx<compute_attr, perf_tuning_knob,
-                            gpu_arch::Xe>>::type;
+                            gpu_arch::Dg2>>::type;
 
     using mem_desc_input_a = mem_desc_t<dtype_a, layout_a, mem_space::global>;
     using mem_desc_input_b = mem_desc_t<dtype_b, layout_b, mem_space::global>;
@@ -52,11 +52,11 @@ struct fp16_gemm_test_func {
     using gemm_t = gemm_t<compute_policy, tile_shape, mem_desc_input_a,
             mem_desc_input_b>;
 
-    using epilogue_t = epilogue_t<epilogue_policy_default<gpu_arch::Xe>,
+    using epilogue_t = epilogue_t<epilogue_policy_default<gpu_arch::Dg2>,
             tile_shape, mem_desc_output_c>;
 
     using group_swizzle
-            = gpu::xetla::kernel::group_swizzle_default<gpu_arch::Xe>;
+            = gpu::xetla::kernel::group_swizzle_default<gpu_arch::Dg2>;
 
     using dispatch_policy = dispatch_policy_kslicing<group_swizzle,
             global_kslicing, local_kslicing>;
diff --git a/tests/integration/gemm/fp16/main.cpp b/tests/integration/gemm/fp16/main.cpp
index fdc579917..cc1c07b3a 100644
--- a/tests/integration/gemm/fp16/main.cpp
+++ b/tests/integration/gemm/fp16/main.cpp
@@ -28,8 +28,8 @@ template <typename T>
 class fp16_gemm_test : public ::testing::Test {};
 TYPED_TEST_SUITE_P(fp16_gemm_test);
 TYPED_TEST_P(fp16_gemm_test, esimd) {
-    gemm_exec<TypeParam, result_validate<TypeParam>, fp16_gemm_func<TypeParam>>(
-            esimd_compile_string);
+    gemm_exec<TypeParam, result_validate<TypeParam>, fp16_gemm_func<TypeParam>(
+        esimd_compile_string);
 }
 REGISTER_TYPED_TEST_SUITE_P(fp16_gemm_test, esimd);
 using tests = ::testing::Types<Test0, Test1, Test2, Test3, Test4, Test5, Test6,
diff --git a/tests/unit/tile_load_store/main.cpp b/tests/unit/tile_load_store/main.cpp
index fcfc6a3c3..82f3eebba 100644
--- a/tests/unit/tile_load_store/main.cpp
+++ b/tests/unit/tile_load_store/main.cpp
@@ -161,8 +161,9 @@ TEST(tile_load_store, esimd) {
     cl::sycl::nd_range<1> nd_range({1}, {1});
     auto result_validate = std::bind(tile_load_store_result_validate<int>, _1,
             _2, _3, 128, 64, 32, 32, 0);
-    kernel_run<int, tile_load_store_func<int, 128, 64, 128, 32, 32, 16, 16>>(
-            nd_range, result_validate);
+    kernel_run<int,
+            tile_load_store_func<int, 128, 64, 128, 32, 32, 16, 16, false,
+                    false, 128, gpu_arch::Dg2>>(nd_range, result_validate);
 }
 
 TEST(tile_load_transpose_store_1, esimd) {
@@ -266,8 +267,8 @@ TEST(tile_load_store_unaligned_2d, esimd) {
     auto result_validate = std::bind(tile_load_store_result_validate<date_type>,
             _1, _2, _3, 127, 63, 32, 32, 0);
     kernel_run<date_type,
-            tile_load_store_unaligned_2d_func<date_type, 127, 63, 127, 32, 32, 16,
-                    16>>(nd_range, result_validate);
+            tile_load_store_unaligned_2d_func<date_type, 127, 63, 127, 32, 32,
+                    16, 16>>(nd_range, result_validate);
 }
 
 TEST(tile_load_store_oob_1, esimd) {

From eafab9e3b4f4187b4cea131366aa19397de5a30a Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 27 Feb 2024 08:39:58 +0000
Subject: [PATCH 02/11] update 2024.1

---
 CMakeLists.txt                                |   1 +
 _clang-format                                 | 122 ------------------
 examples/01_gemm_universal/gemm_universal.cpp |   8 +-
 include/common/common.hpp                     |   6 -
 include/group/gemm/compute_policy.hpp         |  12 +-
 include/kernel/gemm/default_gemm.hpp          |  28 ++--
 include/subgroup/tile/impl/payload_xe.hpp     |   3 +-
 tests/integration/gemm/fp16/main.cpp          |   6 +-
 8 files changed, 31 insertions(+), 155 deletions(-)
 delete mode 100644 _clang-format

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b46837d6..ebfee8f09 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ else() # Windows
 endif() 
 
 project(XeTLA)
+
 include(CTest)
 enable_testing()
 
diff --git a/_clang-format b/_clang-format
deleted file mode 100644
index eee0a4ee7..000000000
--- a/_clang-format
+++ /dev/null
@@ -1,122 +0,0 @@
-#===============================================================================
-# Copyright 2016-2019 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
----
-Language:        Cpp
-AccessModifierOffset: -4
-AlignAfterOpenBracket: DontAlign
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: DontAlign
-AlignOperands:   false
-AlignTrailingComments: false
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: true
-AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  IndentBraces:    false
-  SplitEmptyFunction: true
-  SplitEmptyRecord: true
-  SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: All
-BreakBeforeBraces: Custom
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializers: BeforeComma
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 8
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-FixNamespaceComments: true
-ForEachMacros:
-IncludeBlocks:   Preserve
-IncludeCategories:
-  - Regex: '<[[:alnum:].]+>'
-    Priority: 0
-IncludeIsMainRegex: '(Test)?$'
-IndentCaseLabels: true
-# IndentPPDirectives: AfterHash
-IndentPPDirectives: None
-IndentWidth:     4
-IndentWrappedFunctionNames: false
-KeepEmptyLinesAtTheStartOfBlocks: true
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 60
-PointerAlignment: Right
-ReflowComments:  false
-SortIncludes:    true
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: true
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 1
-SpacesInAngles:  false
-SpacesInContainerLiterals: false
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Cpp11
-StatementMacros:
-  - for_
-  - PRAGMA_OMP
-  - PRAGMA_OMP_SIMD
-TabWidth:        4
-UseTab:          Never
-...
-# vim:ft=conf et ts=2 sw=2
diff --git a/examples/01_gemm_universal/gemm_universal.cpp b/examples/01_gemm_universal/gemm_universal.cpp
index ed566cea3..5144c2e16 100644
--- a/examples/01_gemm_universal/gemm_universal.cpp
+++ b/examples/01_gemm_universal/gemm_universal.cpp
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
 enum class kslicing_impl_t : uint8_t { none = 0, global = 1, local = 2 };
 
@@ -33,9 +33,9 @@ void gemm_universal_run(uint32_t iter) {
     size_t size_b = matrix_k * matrix_n;
     size_t size_c = matrix_m * matrix_n;
 
-    using data_type_a = fp16;
-    using data_type_b = fp16;
-    using data_type_c = fp16;
+    using data_type_a = bf16;
+    using data_type_b = bf16;
+    using data_type_c = bf16;
     using data_type_acc = float;
 
     //Turn on the profiling property to facilitate subsequent profiling
diff --git a/include/common/common.hpp b/include/common/common.hpp
index cccc09bbb..97c831c4b 100644
--- a/include/common/common.hpp
+++ b/include/common/common.hpp
@@ -21,9 +21,3 @@
 
 #include <common/core/core.hpp>
 #include <common/utils/utils.hpp>
-
-#ifdef __SYCL_DEVICE_ONLY__
-#define CONSTANT __attribute__((opencl_constant))
-#else
-#define CONSTANT
-#endif
diff --git a/include/group/gemm/compute_policy.hpp b/include/group/gemm/compute_policy.hpp
index bceb3f339..d22c45829 100644
--- a/include/group/gemm/compute_policy.hpp
+++ b/include/group/gemm/compute_policy.hpp
@@ -105,19 +105,21 @@ struct compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_, arch_tag_,
 /// @tparam perf_tuning_knob_ Is performance-related knobs.
 /// @tparam arch_tag_ Is the HW architecture.
 template <typename compute_attr_, typename perf_tuning_knob_,
-        gpu_arch arch_tag_>
+        gpu_arch arch_tag_ = gpu_arch::Xe, typename enable = void>
 struct compute_policy_default_fpu {};
 
 /// @brief Specialized for Xe architecture.
-template <typename compute_attr_, typename perf_tuning_knob_>
-struct compute_policy_default_fpu<compute_attr_, perf_tuning_knob_,
-        gpu_arch::Xe> {
+template <typename compute_attr_, typename perf_tuning_knob_,
+        gpu_arch arch_tag_>
+struct compute_policy_default_fpu<compute_attr_, perf_tuning_knob_, arch_tag_,
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Xe)>> {
     using compute_attr = compute_attr_;
     using perf_tuning_knob = perf_tuning_knob_;
     static constexpr int k_stride = perf_tuning_knob::k_stride;
     static constexpr int stages = perf_tuning_knob::stages;
     static constexpr int sync_freq = perf_tuning_knob::sync_freq;
-    static constexpr gpu_arch arch_tag = gpu_arch::Xe;
+    static constexpr gpu_arch arch_tag = arch_tag_;
+
     using dtype_mma_acc = typename compute_attr::dtype_acc;
     using dtype_mma_a = typename compute_attr::dtype_a;
     using dtype_mma_b = typename compute_attr::dtype_b;
diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index 0b0584062..63625f4c8 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -1,18 +1,18 @@
 /*******************************************************************************
- * Copyright (c) 2022-2023 Intel Corporation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *******************************************************************************/
+* Copyright (c) 2022-2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
 
 /// @file
 /// C++ API
diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp
index 60fc4ee85..302167448 100644
--- a/include/subgroup/tile/impl/payload_xe.hpp
+++ b/include/subgroup/tile/impl/payload_xe.hpp
@@ -1308,7 +1308,8 @@ struct prefetch_payload_t<
         tile_desc_t<tile_size_x_, tile_size_y_, block_size_x_, block_size_y_,
                 reg_layout_>,
         num_coop_sg_, arch_tag_,
-        std::enable_if_t<(arch_tag_ == gpu_arch::Dg2)>> {
+        std::enable_if_t<(arch_tag_ == gpu_arch::Dg2
+                && (tile_size_y_ != 1 || block_size_y_ != 1))>> {
     using dtype = dtype_;
     using mem_desc_t
             = mem_desc_t<dtype_, mem_layout_, mem_space::global, alignment_>;
diff --git a/tests/integration/gemm/fp16/main.cpp b/tests/integration/gemm/fp16/main.cpp
index cc1c07b3a..6e52131dd 100644
--- a/tests/integration/gemm/fp16/main.cpp
+++ b/tests/integration/gemm/fp16/main.cpp
@@ -16,8 +16,8 @@
 
 #include "common.hpp"
 #include "kernel_func.hpp"
-#include <utils/utils.hpp>
 #include <gtest/gtest.h>
+#include <utils/utils.hpp>
 
 std::string esimd_compile_string
         = " -vc-codegen -doubleGRF "
@@ -28,8 +28,8 @@ template <typename T>
 class fp16_gemm_test : public ::testing::Test {};
 TYPED_TEST_SUITE_P(fp16_gemm_test);
 TYPED_TEST_P(fp16_gemm_test, esimd) {
-    gemm_exec<TypeParam, result_validate<TypeParam>, fp16_gemm_func<TypeParam>(
-        esimd_compile_string);
+    gemm_exec<TypeParam, result_validate<TypeParam>, fp16_gemm_func<TypeParam>>(
+            esimd_compile_string);
 }
 REGISTER_TYPED_TEST_SUITE_P(fp16_gemm_test, esimd);
 using tests = ::testing::Types<Test0, Test1, Test2, Test3, Test4, Test5, Test6,

From e1cc7ad1975823afc4b28fff8d7a81e17b6e8fea Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 27 Feb 2024 08:44:35 +0000
Subject: [PATCH 03/11] using instead of deriving

---
 include/kernel/gemm/default_gemm.hpp | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index 63625f4c8..502b28076 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -61,10 +61,9 @@ template <typename dtype_a, mem_layout mem_layout_a, uint32_t alignment_a,
         typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
         typename dtype_acc, gpu_arch gpu_arch_tag = gpu_arch::Xe,
         typename tune_option = dict_t<>>
-struct default_gemm_t
-    : default_gemm_config_t<dtype_a, mem_layout_a, alignment_a, dtype_b,
-              mem_layout_b, alignment_b, dtype_c, mem_layout_c, alignment_c,
-              dtype_acc, gpu_arch_tag, tune_option>::type {};
+using default_gemm_t = typename default_gemm_config_t<dtype_a, mem_layout_a,
+        alignment_a, dtype_b, mem_layout_b, alignment_b, dtype_c, mem_layout_c,
+        alignment_c, dtype_acc, gpu_arch_tag, tune_option>::type;
 } // namespace kernel
 
 template <typename dict_t_>
@@ -158,11 +157,10 @@ template <typename dtype_a, mem_layout mem_layout_a, uint32_t alignment_a,
         uint32_t alignment_b, mem_space mem_space_b, typename dtype_acc,
         typename wg_shape, uint32_t wg_tile_k,
         gpu_arch gpu_arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
-struct default_gemm_selector_t
-    : default_gemm_selector_config_t<dtype_a, mem_layout_a, alignment_a,
-              mem_space_a, dtype_b, mem_layout_b, alignment_b, mem_space_b,
-              dtype_acc, wg_shape, wg_tile_k, gpu_arch_tag, tune_option>::type {
-};
+using default_gemm_selector_t = typename default_gemm_selector_config_t<dtype_a,
+        mem_layout_a, alignment_a, mem_space_a, dtype_b, mem_layout_b,
+        alignment_b, mem_space_b, dtype_acc, wg_shape, wg_tile_k, gpu_arch_tag,
+        tune_option>::type;
 
 template <typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
         mem_space mem_space_c, typename wg_shape, uint32_t wg_tile_k,
@@ -188,10 +186,10 @@ struct default_epilogue_selector_config_t
 template <typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
         mem_space mem_space_c, typename wg_shape, uint32_t wg_tile_k,
         gpu_arch gpu_arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
-struct default_epilogue_selector_t
-    : default_epilogue_selector_config_t<dtype_c, mem_layout_c, alignment_c,
-              mem_space_c, wg_shape, wg_tile_k, gpu_arch_tag,
-              tune_option>::type {};
+using default_epilogue_selector_t =
+        typename default_epilogue_selector_config_t<dtype_c, mem_layout_c,
+                alignment_c, mem_space_c, wg_shape, wg_tile_k, gpu_arch_tag,
+                tune_option>::type;
 } // namespace group
 
 template <typename dict_t_>

From f53a39ba48c3fab3a2c23cd983d2cf0081afe86b Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 27 Feb 2024 08:55:06 +0000
Subject: [PATCH 04/11] reformat sdp

---
 .../scaled_dot_product_attention.cpp                          | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
index 0060c299e..9c25fc9cc 100644
--- a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
+++ b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
@@ -232,8 +232,8 @@ void sdp_fwd_run(uint32_t iter) {
     cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
 
     constexpr uint32_t warmup = 10;
-    int64_t ops = int64_t(4 * batch_num * head_num * sequence_len) * sequence_len
-            * head_size;
+    int64_t ops = int64_t(4 * batch_num * head_num * sequence_len)
+            * sequence_len * head_size;
     profiling_helper prof("sdp", ops, "gflops");
     try {
         for (uint32_t i = 0; i < iter + warmup; i++) {

From c96aef2989e8f80f99a429610e523859464367be Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Fri, 1 Mar 2024 16:50:17 +0000
Subject: [PATCH 05/11] Enable dg2 sdp example

---
 .editorconfig                                 |  15 ++
 CMakeLists.txt                                |   4 +-
 .../scaled_dot_product_attention.cpp          | 198 ++++++++++--------
 .../softmax.hpp                               |  12 +-
 examples/CMakeLists.txt                       |   7 +-
 include/common/utils/raw_send_nbarrier.hpp    |  28 ++-
 include/group/gemm/compute_policy.hpp         |   2 -
 include/kernel/default_config/common.hpp      |  75 ++++---
 .../default_config/decision_tree_policy.hpp   |  72 +++----
 .../kernel/default_config/dummy_policy.hpp    |   4 +-
 include/kernel/gemm/default_gemm.hpp          |  10 +-
 include/kernel/gemm/gemm_preset.hpp           |   4 +-
 include/subgroup/tile/impl/payload_xe.hpp     |   2 +-
 tests/integration/gemm/fp16/common.hpp        |  22 +-
 tests/integration/gemm/fp16/kernel_func.hpp   |  12 +-
 tests/utils/execution.hpp                     | 103 ++++++++-
 16 files changed, 346 insertions(+), 224 deletions(-)
 create mode 100644 .editorconfig

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..d2f375aac
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,15 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+# C/C++ follows clang-format
+[*.{c,cpp,h,hpp}]
+indent_style = space
+indent_size = 4
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ebfee8f09..d305e5a65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,14 +9,14 @@ if (NOT CMAKE_BUILD_TYPE)
 endif()
 if(UNIX)
 else() # Windows
-    # Force CMake to use icx-cl rather than the default C++ compiler/linker 
+    # Force CMake to use icx-cl rather than the default C++ compiler/linker
     # (needed on Windows only)
     # include (CMakeForceCompiler)
     # CMAKE_FORCE_CXX_COMPILER (icx-cl IntelDPCPP)
     set(CMAKE_CXX_COMPILER icx-cl)
     include (Platform/Windows-Clang)
     include(cmake/GTestExternal.cmake)
-endif() 
+endif()
 
 project(XeTLA)
 
diff --git a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
index 9c25fc9cc..420a854c4 100644
--- a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
+++ b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
@@ -135,10 +135,12 @@ int sdp_fwd_result_validate(dtype_in *q_device, dtype_in *k_device,
     return result ? 0 : 1;
 }
 
-void sdp_fwd_run(uint32_t iter) {
-    // Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
-    // Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
-    // Please contact us for support.
+template <gpu_arch arch_tag>
+void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
+    // Tips, the example demonstrates programming kernel with XeTLA, it works as
+    // expected with current configurations. Please make sure you fully understand
+    // these configurations before you do any modifications, incomplete changes
+    // may lead to unexpected behaviors. Please contact us for support.
 
     using dtype_in = bf16;
     using dtype_out = bf16;
@@ -150,9 +152,11 @@ void sdp_fwd_run(uint32_t iter) {
     constexpr uint32_t matrix_n_qk = sequence_len;
     constexpr uint32_t matrix_k_qk = head_size;
 
-    constexpr uint32_t wg_tile_m_qk = 64;
-    constexpr uint32_t wg_tile_n_qk = 512;
-    constexpr uint32_t sg_tile_m_qk = 32;
+    constexpr uint32_t wg_tile_m_qksv = arch_tag == gpu_arch::Xe ? 64 : 32;
+
+    constexpr uint32_t wg_tile_m_qk = wg_tile_m_qksv;
+    constexpr uint32_t wg_tile_n_qk = 512; // must == sl_kv
+    constexpr uint32_t sg_tile_m_qk = arch_tag == gpu_arch::Xe ? 32 : 16;
     constexpr uint32_t sg_tile_n_qk = 32;
     constexpr uint32_t wg_tile_k_qk = 32;
 
@@ -161,10 +165,11 @@ void sdp_fwd_run(uint32_t iter) {
     constexpr uint32_t matrix_n_sv = head_size;
     constexpr uint32_t matrix_k_sv = sequence_len;
 
-    constexpr uint32_t wg_tile_m_sv = 64;
-    constexpr uint32_t wg_tile_n_sv = 64;
+    // constexpr uint32_t wg_tile_m_sv = 64;
+    constexpr uint32_t wg_tile_m_sv = wg_tile_m_qksv;
+    constexpr uint32_t wg_tile_n_sv = 64; // must == head_dim
     constexpr uint32_t sg_tile_m_sv = 8;
-    constexpr uint32_t sg_tile_n_sv = 16;
+    constexpr uint32_t sg_tile_n_sv = arch_tag == gpu_arch::Xe ? 16 : 8;
     constexpr uint32_t wg_tile_k_sv = 32;
 
     // buffer size of softmax row data
@@ -178,11 +183,12 @@ void sdp_fwd_run(uint32_t iter) {
     auto context = queue.get_info<info::queue::context>();
     auto device = queue.get_info<info::queue::device>();
 
-    std::cout << "Running on " << device.get_info<info::device::name>() << "\n";
+    print_device_details(device);
 
     constexpr uint32_t size_qkv = matrix_m_qk * matrix_k_qk;
     constexpr uint32_t size_mask = matrix_m_qk * matrix_n_qk;
     constexpr uint32_t size_out = matrix_m_sv * matrix_n_sv;
+    const float scale_qk = 1.f / std::sqrt(head_size);
 
     auto q = alloc_device_and_init<dtype_in>(
             batch_cnt * size_qkv,
@@ -220,6 +226,11 @@ void sdp_fwd_run(uint32_t iter) {
     constexpr uint32_t subgroup_range_m = wg_tile_m_qk / sg_tile_m_qk;
     constexpr uint32_t subgroup_range_n = wg_tile_n_qk / sg_tile_n_qk;
 
+    constexpr uint32_t slm_size
+            = wg_tile_m_qk * wg_tile_n_qk * sizeof(dtype_sfx);
+    XETLA_ASSERT(slm_size <= device.get_info<info::device::local_mem_size>(),
+            "SLM size too large!");
+
     static_assert(subgroup_range_m * subgroup_range_n == thread_num,
             "Given thread number should equal to pre-set value 32!");
     std::cout << "group_num_x: " << group_range_n
@@ -231,7 +242,6 @@ void sdp_fwd_run(uint32_t iter) {
     cl::sycl::range<3> local_range {1, subgroup_range_m, subgroup_range_n};
     cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
 
-    constexpr uint32_t warmup = 10;
     int64_t ops = int64_t(4 * batch_num * head_num * sequence_len)
             * sequence_len * head_size;
     profiling_helper prof("sdp", ops, "gflops");
@@ -239,14 +249,13 @@ void sdp_fwd_run(uint32_t iter) {
         for (uint32_t i = 0; i < iter + warmup; i++) {
             if (i >= warmup) { prof.cpu_start(); }
             auto gpu_event = queue.submit([&](handler &cgh) {
-                cgh.parallel_for<
-                        class Test>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
+                cgh.parallel_for(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
                     using namespace gpu::xetla;
                     using namespace gpu::xetla::group;
                     using namespace gpu::xetla::kernel;
                     using namespace gpu::xetla::subgroup;
 
-                    uint32_t batch_id = item.get_group(0);
+                    const uint32_t batch_id = item.get_group(0);
                     // disable sync in gemm
                     static constexpr uint32_t periodic_sync_interval = 0;
                     static constexpr uint32_t prefetch_distance = 3;
@@ -254,19 +263,23 @@ void sdp_fwd_run(uint32_t iter) {
                     using wg_shape0 = shape<wg_tile_n_qk, wg_tile_m_qk>;
                     using sg_shape0 = shape<sg_tile_n_qk, sg_tile_m_qk>;
 
-                    using post_op0_t = scalar_mul_op_t<float, gpu_arch::Dg2>;
+                    using post_op0_t = scalar_mul_op_t<float, arch_tag>;
                     using post_op1_t = elemwise_reduce_op_t<reduce_op::sum,
-                            dtype_in, gpu_arch::Dg2>;
+                            dtype_in, arch_tag>;
                     using post_op_t = chained_tile_op_t<post_op0_t, post_op1_t>;
                     using epilogue_policy0
                             = xetla::group::epilogue_policy_tile_op<post_op_t,
-                                    gpu_arch::Dg2>;
-                    using group_swizzle = group_swizzle_default<gpu_arch::Dg2>;
-
-                    using tune_option0 = dict_t<
-                            elem_v_t<tune_key::param_optimizer_type,
-                                    tune_key_value::
-                                            param_optimizer_decision_tree>,
+                                    arch_tag>;
+                    using group_swizzle = group_swizzle_default<arch_tag>;
+
+                    using elem_opt_mode_t
+                            = elem_v_t<tune_key::param_optimizer_mode,
+                                    param_optimizer_mode::keep_shape>;
+                    using elem_opt_type_t = elem_v_t<
+                            tune_key::param_optimizer_type,
+                            tune_key_value::param_optimizer_decision_tree>;
+                    using tune_option0 = dict_t< //
+                            elem_opt_type_t, elem_opt_mode_t,
                             elem_t_t<tune_key::epilogue_policy,
                                     epilogue_policy0>,
                             elem_t_t<tune_key::sg_tile_shape, sg_shape0>,
@@ -285,10 +298,10 @@ void sdp_fwd_run(uint32_t iter) {
                             8, // leading dimension for B, in unit of element
                             mem_space::
                                     global, // memory reading from global mem for B
-                            float, // accumulator data type for intermediate resutls
+                            float, // accumulator data type for intermediate results
                             wg_shape0, // computation tile shape
                             wg_tile_k_qk, // elements in each iteration
-                            gpu_arch::Dg2, // GPU arch
+                            arch_tag, // GPU arch
                             tune_option0>;
                     using epilogue0_t = xetla::group::default_epilogue_selector_t<
                             dtype_sfx, // onput datatype for C
@@ -298,7 +311,7 @@ void sdp_fwd_run(uint32_t iter) {
                                     local, // memory writing to local mem for C
                             wg_shape0, // computation tile shape
                             wg_tile_k_qk, // elements in each iteration
-                            gpu_arch::Dg2, // GPU arch
+                            arch_tag, // GPU arch
                             tune_option0>;
                     using gemm_op0_t = gemm_universal_t<
                             dispatch_policy_default<group_swizzle>, gemm0_t,
@@ -307,29 +320,27 @@ void sdp_fwd_run(uint32_t iter) {
                     using tile_shape0 = typename gemm0_t::tile_shape;
 
                     // initialize SLM size
-                    constexpr uint32_t slm_size
-                            = wg_tile_m_qk * wg_tile_n_qk * sizeof(dtype_sfx);
                     xetla_local_init<slm_size>();
 
                     // initialize named barrier count
                     // we only need to do thread sync while store gemm results to SLM
                     // one barrier is enough for that
                     xetla_nbarrier_init<1>();
-                    xetla_nbarrier_t<thread_num, thread_num, gpu_arch::Dg2>
-                            nbarrier;
+                    xetla_nbarrier_t<thread_num, thread_num, arch_tag> nbarrier;
                     nbarrier.init_nbarrier(0, nbarrier_role::producer_consumer);
 
                     // initialize gemm op: gemm result store to shared local memory
-                    typename post_op0_t::arguments_t post_op0_arg(0.125);
+                    typename post_op0_t::arguments_t post_op0_arg(scale_qk);
                     typename post_op1_t::arguments_t post_op1_arg(
+                            // attn_mask pre-load ptr batch offset
                             attn_mask + batch_id / head_num * size_mask
                                     + wg_tile_m_qk * wg_tile_n_qk
-                                            * item.get_group(
-                                                    1), // attn_mask pre-load ptr batch offset
-                            {matrix_n_qk, // attn_mask tdesc width
+                                            * item.get_group(1),
+                            {
+                                    matrix_n_qk, // attn_mask tdesc width
                                     matrix_m_qk, // attn_mask tdesc height
-                                    matrix_n_qk} // attn_mask tdesc pitch
-                    );
+                                    matrix_n_qk, // attn_mask tdesc pitch
+                            });
                     typename gemm_op0_t::arguments_t arg0(matrix_m_qk,
                             matrix_k_qk, matrix_n_qk,
                             q + batch_id * size_qkv, // matA_ptr + batch offset
@@ -339,22 +350,20 @@ void sdp_fwd_run(uint32_t iter) {
                             0, // matC_base
                             matrix_n_qk, // matC load width
                             {{post_op0_arg, post_op1_arg}});
-                    gemm_op0_t gemm_op0;
-                    gemm_op0(item, arg0);
+                    gemm_op0_t {}(item, arg0);
                     xetla_fence<memory_kind::shared_local>();
                     nbarrier.arrive_wait();
 
                     // softmax start: result store to SLM
                     using softmax_op_t = xetla_softmax_fwd_t<dtype_sfx,
                             dtype_in, tile_shape0, mem_space::local,
-                            mem_space::local, SIMD, thread_num, softmax_sz>;
+                            mem_space::local, SIMD, thread_num, softmax_sz,
+                            arch_tag>;
                     typename softmax_op_t::arguments_t arg1;
-                    softmax_op_t softmax_op;
-
                     arg1.data_in_base = 0;
                     arg1.data_out_base = 0;
 
-                    softmax_op(item, &arg1);
+                    softmax_op_t {}(item, &arg1);
                     xetla_fence<memory_kind::shared_local>();
                     nbarrier.arrive_wait();
 
@@ -362,10 +371,8 @@ void sdp_fwd_run(uint32_t iter) {
                     using wg_shape1 = shape<wg_tile_n_sv, wg_tile_m_sv>;
                     using sg_shape1 = shape<sg_tile_n_sv, sg_tile_m_sv>;
 
-                    using tune_option1 = dict_t<
-                            elem_v_t<tune_key::param_optimizer_type,
-                                    tune_key_value::
-                                            param_optimizer_decision_tree>,
+                    using tune_option1 = dict_t< //
+                            elem_opt_type_t, elem_opt_mode_t,
                             elem_t_t<tune_key::sg_tile_shape, sg_shape1>,
                             elem_v_t<tune_key::prefetch_distance,
                                     prefetch_distance>,
@@ -383,10 +390,10 @@ void sdp_fwd_run(uint32_t iter) {
                             8, // leading dimension for B, in unit of element
                             mem_space::
                                     global, // memory reading from global mem for B
-                            float, // accumulator data type for intermediate resutls
+                            float, // accumulator data type for intermediate results
                             wg_shape1, // computation tile shape
                             wg_tile_k_sv, // elements in each iteration
-                            gpu_arch::Dg2, // GPU arch
+                            arch_tag, // GPU arch
                             tune_option1>;
 
                     // gemm arguments include matA & matB load information and
@@ -395,6 +402,8 @@ void sdp_fwd_run(uint32_t iter) {
                     using work_group_t = typename gemm1_t::work_group_t;
                     using mem_desc_a_t = typename gemm1_t::mem_desc_a_t;
                     using mem_desc_b_t = typename gemm1_t::mem_desc_b_t;
+                    using mem_desc_c_t = mem_desc_t<dtype_out,
+                            mem_layout::row_major, mem_space::global>;
                     // Using gemm::matAcc init a matC class for future storage
                     using matAcc_t = typename gemm1_t::matAcc_t;
                     using matC_t = tile_t<dtype_out,
@@ -416,9 +425,8 @@ void sdp_fwd_run(uint32_t iter) {
                     int start_m = item.get_group(1) * wg_tile_m_sv;
                     int start_k = 0;
                     uint32_t wg_tile_k = matrix_k;
-                    uint32_t boundary_n = (start_n + wg_tile_n_sv) > matrix_n
-                            ? matrix_n
-                            : (start_n + wg_tile_n_sv);
+                    uint32_t boundary_n
+                            = std::min(start_n + wg_tile_n_sv, matrix_n);
                     uint32_t boundary_k = wg_tile_k;
 
                     work_group_t g;
@@ -431,42 +439,45 @@ void sdp_fwd_run(uint32_t iter) {
                     mem_desc_b.init(matB_ptr, {boundary_n, boundary_k, matB_ld},
                             {start_n, start_k});
 
-                    uint32_t inner_loop_count
+                    uint32_t sg_k_count
                             = (wg_tile_k + wg_tile_k_sv - 1) / wg_tile_k_sv;
-                    gemm_args_t gemm_args(
-                            mem_desc_a, mem_desc_b, inner_loop_count);
+                    gemm_args_t gemm_args(mem_desc_a, mem_desc_b, sg_k_count);
                     matAcc_t matAcc;
-                    matC_t matC;
-                    gemm1_t gemm;
 
                     matAcc.init(0);
-                    gemm(g, matAcc, gemm_args);
+                    gemm1_t {}(g, matAcc, gemm_args);
+
                     // permute store
+                    matC_t matC;
                     subgroup::elemwise_cvt<matC_t, matAcc_t>(matC, matAcc);
-                    xetla_tdescriptor transpose_tdecs;
-                    // Define a temprary vector as output buffer
-                    xetla_vector<dtype_out, sg_tile_n_sv> out_reg;
                     // Calculate new coordination of each element
-                    uint32_t b = item.get_group(0) / head_num;
-                    uint32_t n = item.get_group(0) % head_num;
-                    uint32_t f = start_m + gemm1_t::get_matC_offset_y(g);
-                    uint32_t h = start_n + gemm1_t::get_matC_offset_x(g);
-
-                    // transpose 8 * 16 tile and store to global
-                    for (uint32_t j = 0; j < sg_tile_m_sv; ++j, ++f) {
-                        uint32_t dst_offset
-                                = b * head_num * sequence_len * head_size
-                                + f * head_num * head_size + n * head_size;
-                        out_reg = matC.reg.xetla_select<sg_tile_n_sv, 1>(
-                                j * sg_tile_n_sv);
-                        xetla_fill_tdesc<dtype_out, sg_tile_n_sv, 1, 1>(
-                                transpose_tdecs.xetla_format<uint32_t>(),
-                                out + dst_offset, head_size, 1, head_size, h,
-                                0);
-                        xetla_tstore_global<dtype_out, sg_tile_n_sv,
-                                cache_hint::write_back, cache_hint::write_back,
-                                gpu_arch::Dg2>(transpose_tdecs, out_reg);
-                    }
+                    const uint32_t b = batch_id / head_num;
+                    const uint32_t n = batch_id % head_num;
+                    const uint32_t batch_offset
+                            = b * head_num * sequence_len * head_size
+                            + start_m * head_num * head_size + n * head_size
+                            + start_n;
+                    const uint32_t f = gemm1_t::get_matC_offset_y(g);
+                    const uint32_t h = gemm1_t::get_matC_offset_x(g);
+
+                    const auto ld_c = head_num * head_size;
+                    mem_desc_c_t mem_desc_c;
+                    mem_desc_c.init(
+                            out + batch_offset, // dst_base = out_ptr + wg offset
+                            {
+                                    std::min(h + sg_tile_n_sv, wg_tile_n_sv),
+                                    std::min(f + sg_tile_m_sv, wg_tile_m_sv),
+                                    ld_c,
+                            },
+                            {int(h), int(f)});
+
+                    constexpr auto msg_type_c = msg_type::block_2d;
+                    using mat_tile_desc = typename matC_t::tile_desc;
+                    using matC_payload_t = subgroup::mem_payload_t<mem_desc_c_t,
+                            mat_tile_desc, msg_type_c, arch_tag>;
+                    matC_payload_t matC_payload(mem_desc_c);
+                    subgroup::tile_store<cache_hint::write_back,
+                            cache_hint::write_back>(matC, matC_payload);
                 });
             });
             gpu_event.wait();
@@ -488,7 +499,7 @@ void sdp_fwd_run(uint32_t iter) {
                     mem_layout::col_major, mem_layout::row_major,
                     mem_layout::row_major));
 
-    //performance
+    // performance
     prof.print_profiling_result(profiling_selector::GPU);
 
     free(q, context);
@@ -498,28 +509,41 @@ void sdp_fwd_run(uint32_t iter) {
     free(out, context);
 }
 
+/// @brief Wrapper whose `exec` runs the SDP forward example for one GPU
+/// architecture; main() hands this template to dispatch_arch.
+/// @tparam arch_tag architecture forwarded to sdp_fwd_run.
+template <gpu_arch arch_tag>
+struct main_wrapper {
+    // Every architecture currently runs the same number of iterations, so
+    // no per-architecture branch is needed.
+    // TODO(review): add an `if constexpr` here if that ever diverges.
+    static constexpr auto exec = []() { sdp_fwd_run<arch_tag>(10); };
+};
+
 int main() {
     // This example implements scaled-dot-production with batch_size: 16,
-    // num_heads: 16, sequence_lenth: 512, head_size: 64. It will be shown how to
+    // num_heads: 16, sequence_length: 512, head_size: 64. It will be shown how to
     // remap the index space of each work-item used for gemm1, softmax and gemm2.
 
     // Description:
-    // Scaled-dot-production mechanism can be seen as two chained batch MatMul with
-    // a softmax in the middle layer. It can be descripted as following
+    // Scaled-dot-product attention can be seen as two chained batch MatMuls
+    // with a softmax in between. It can be described by the following
     // mathematical expression:
-    //   softmax(Q · (K.transpose(-1, -2)) * (1 / sqr_root(num_heads)) + attn_mask) · V
+    //   softmax(Q · (K.transpose(-1, -2)) * (1 / sqrt(head_size)) +
+    //   attn_mask) · V
     // where:
     //   Q, K, V: input data
     //   shape(Q) = [16 x 16, 512, 64]
     //   shape(K) = [16 x 16, 512, 64]
     //   shape(V) = [16 x 16, 512, 64]
     //   shape(attn_mask) = [16, 512, 512]
+    //   shape(DST) = [16, 512, 16, 64]  (batch, sequence, head, head_size)
 
     // This kernel is designed to execute the following task:
     // 1: S = (Q · (K.transpose(-1, -2))) * (1 / sqr_root(num_heads)) + attn_mask
     // 2: S' = softmax(S)
     // 3: O = S' · V
 
-    sdp_fwd_run(10);
+    dispatch_arch<main_wrapper>::exec();
     return 0;
 }
diff --git a/examples/08_scaled_dot_product_attention/softmax.hpp b/examples/08_scaled_dot_product_attention/softmax.hpp
index 58fb1c688..0fc04b8aa 100644
--- a/examples/08_scaled_dot_product_attention/softmax.hpp
+++ b/examples/08_scaled_dot_product_attention/softmax.hpp
@@ -24,7 +24,7 @@ using namespace gpu::xetla::subgroup;
 
 template <typename dtype_in_, typename dtype_out_, typename tile_shape_,
         mem_space mem_space_in_, mem_space mem_space_out_, uint32_t SIMD_,
-        uint32_t thread_num_, uint32_t softmax_size_>
+        uint32_t thread_num_, uint32_t softmax_size_, gpu_arch arch_tag>
 struct xetla_softmax_fwd_t {
     using dtype_in = dtype_in_;
     using dtype_out = dtype_out_;
@@ -56,16 +56,14 @@ struct xetla_softmax_fwd_t {
     using softmax_load_payload_t = subgroup::mem_payload_t<
             mem_desc_t<dtype_in, mem_layout::row_major, mem_space_in>,
             softmax_tile_desc_t,
-            subgroup::msg_type_v<softmax_tile_desc_t, mem_space_in>,
-            gpu_arch::Dg2>;
+            subgroup::msg_type_v<softmax_tile_desc_t, mem_space_in>, arch_tag>;
 
     // this tile will store the softmax result to global memory
     using softmax_store_t = subgroup::tile_t<dtype_out, softmax_tile_desc_t>;
     using softmax_store_payload_t = subgroup::mem_payload_t<
             mem_desc_t<dtype_out, mem_layout::row_major, mem_space_out>,
             softmax_tile_desc_t,
-            subgroup::msg_type_v<softmax_tile_desc_t, mem_space_out>,
-            gpu_arch::Dg2>;
+            subgroup::msg_type_v<softmax_tile_desc_t, mem_space_out>, arch_tag>;
 
     struct arguments_t {
         // available while original data is from SLM
@@ -113,10 +111,10 @@ struct xetla_softmax_fwd_t {
             row_data_32 = softmax_load.reg.xetla_select<softmax_size, 1>(0);
 
             // get max
-            float xmax = hmax<float, float, softmax_size>(row_data_32);
+            float x_max = hmax<float, float, softmax_size>(row_data_32);
 
             // get exp_sum
-            row_data_32 -= xmax;
+            row_data_32 -= x_max;
             row_data_32 = exp(row_data_32);
             float exp_sum = sum<float, float, softmax_size>(row_data_32);
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 1f50f5d7d..193696628 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,6 +1,11 @@
 include_directories(${CMAKE_SOURCE_DIR}/include)
 include_directories(${CMAKE_SOURCE_DIR})
 
+# Create a separate device code module for each SYCL* kernel
+# so that the Dg2 and Xe kernels are JIT-compiled separately
+add_compile_options(-fsycl-device-code-split=per_kernel)
+add_link_options(-fsycl-device-code-split=per_kernel)
+
 add_subdirectory(01_gemm_universal)
 add_subdirectory(02_basic_gemm)
 add_subdirectory(03_gemm_relu_bias)
@@ -13,4 +18,4 @@ add_subdirectory(09_gate_recurrent_unit)
 add_subdirectory(10_gemm_large_n)
 if(UNIX)  # pvc not available on win?
 add_subdirectory(11_stream_k_gemm)
-endif()
\ No newline at end of file
+endif()
diff --git a/include/common/utils/raw_send_nbarrier.hpp b/include/common/utils/raw_send_nbarrier.hpp
index 5050b9c51..7bde822b0 100644
--- a/include/common/utils/raw_send_nbarrier.hpp
+++ b/include/common/utils/raw_send_nbarrier.hpp
@@ -41,8 +41,12 @@ enum class nbarrier_role : uint8_t {
 /// as consumer.
 ///
 template <uint8_t num_producers = 1, uint8_t num_consumers = 1,
-        gpu_arch arch_tag = gpu_arch::Xe>
-struct xetla_nbarrier_t {
+        gpu_arch arch_tag = gpu_arch::Xe, typename enable = void>
+struct xetla_nbarrier_t;
+
+template <uint8_t num_producers, uint8_t num_consumers, gpu_arch arch_tag>
+struct xetla_nbarrier_t<num_producers, num_consumers, arch_tag,
+        std::enable_if_t<arch_tag == gpu_arch::Xe>> {
     ///
     /// @brief Description of named barrier objection.
     /// Structure is defined in
@@ -87,8 +91,9 @@ struct xetla_nbarrier_t {
     }
 };
 
-template <uint8_t num_producers, uint8_t num_consumers>
-struct xetla_nbarrier_t<num_producers, num_consumers, gpu_arch::Dg2> {
+template <uint8_t num_producers, uint8_t num_consumers, gpu_arch arch_tag>
+struct xetla_nbarrier_t<num_producers, num_consumers, arch_tag,
+        std::enable_if_t<arch_tag != gpu_arch::Xe>> {
     ///
     /// @brief Description of named barrier objection.
     /// Structure is defined in
@@ -106,24 +111,15 @@ struct xetla_nbarrier_t<num_producers, num_consumers, gpu_arch::Dg2> {
 
     /// @brief Generic work-group split barrier.
     ///
-    __XETLA_API void arrive() {
-        // __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::signal>();
-        __ESIMD_NS::barrier();
-    }
+    __XETLA_API void arrive() { __ESIMD_NS::barrier(); }
 
     /// @brief named barrier wait within subgroup.
     ///
-    __XETLA_API void wait() {
-        // __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::wait>();
-        __ESIMD_NS::barrier();
-    }
+    __XETLA_API void wait() { __ESIMD_NS::barrier(); }
 
     /// @brief named barrier signal from subgroup.
     ///
-    __XETLA_API void arrive_wait() {
-        arrive();
-        wait();
-    }
+    __XETLA_API void arrive_wait() { __ESIMD_NS::barrier(); }
 };
 
 /// @} xetla_util_named_barrier
diff --git a/include/group/gemm/compute_policy.hpp b/include/group/gemm/compute_policy.hpp
index d22c45829..aa4343a28 100644
--- a/include/group/gemm/compute_policy.hpp
+++ b/include/group/gemm/compute_policy.hpp
@@ -55,7 +55,6 @@ struct compute_policy_default_xmx<compute_attr_, perf_tuning_knob_, arch_tag_,
     static constexpr uint32_t block_size_y_a = 16;
 
     static constexpr uint32_t block_size_x_b = arch_tag < gpu_arch::Xe ? 8 : 16;
-
     static constexpr uint32_t block_bytes_y_b = 32;
     static constexpr uint32_t block_size_y_b
             = block_bytes_y_b / sizeof(dtype_mma_b);
@@ -119,7 +118,6 @@ struct compute_policy_default_fpu<compute_attr_, perf_tuning_knob_, arch_tag_,
     static constexpr int stages = perf_tuning_knob::stages;
     static constexpr int sync_freq = perf_tuning_knob::sync_freq;
     static constexpr gpu_arch arch_tag = arch_tag_;
-
     using dtype_mma_acc = typename compute_attr::dtype_acc;
     using dtype_mma_a = typename compute_attr::dtype_a;
     using dtype_mma_b = typename compute_attr::dtype_b;
diff --git a/include/kernel/default_config/common.hpp b/include/kernel/default_config/common.hpp
index c0910b3e9..046eb3237 100644
--- a/include/kernel/default_config/common.hpp
+++ b/include/kernel/default_config/common.hpp
@@ -52,8 +52,38 @@ enum class tune_key : uint8_t {
     dispatch_policy,
     group_swizzle_policy,
     param_optimizer_type,
+    param_optimizer_mode,
     source_location
 };
+template <typename T> // T must expose find_elem_t<tune_key>::type
+using data_type_a_t =
+        typename T::template find_elem_t<tune_key::data_type_a>::type;
+template <typename T>
+using data_type_b_t =
+        typename T::template find_elem_t<tune_key::data_type_b>::type;
+template <typename T>
+using data_type_c_t =
+        typename T::template find_elem_t<tune_key::data_type_c>::type;
+template <typename T> // T must expose find_elem_v<tune_key>
+inline constexpr auto memory_layout_a_v
+        = T::template find_elem_v<tune_key::memory_layout_a>;
+template <typename T>
+inline constexpr auto memory_alignment_a_v
+        = T::template find_elem_v<tune_key::memory_alignment_a>;
+template <typename T>
+inline constexpr auto memory_layout_b_v
+        = T::template find_elem_v<tune_key::memory_layout_b>;
+template <typename T>
+inline constexpr auto memory_alignment_b_v
+        = T::template find_elem_v<tune_key::memory_alignment_b>;
+template <typename T>
+inline constexpr auto memory_layout_c_v
+        = T::template find_elem_v<tune_key::memory_layout_c>;
+template <typename T>
+inline constexpr auto memory_alignment_c_v
+        = T::template find_elem_v<tune_key::memory_alignment_c>;
+template <typename T>
+inline constexpr auto gpu_arch_v = T::template find_elem_v<tune_key::gpu_arch>;
 
 enum class tune_key_value : uint8_t {
     pre_processing_default,
@@ -68,45 +98,24 @@ enum class tune_key_value : uint8_t {
 // parameter optimizer
 
 enum class param_optimizer_tag : uint8_t { kernel, work_group };
+enum class param_optimizer_mode : uint8_t { full, keep_shape };
 
 template <param_optimizer_tag tag_, typename dict_t_>
 struct param_optimizer;
 
 struct param_optimizer_base {
     template <typename T, typename U>
-    struct validate_attribute {
-        static constexpr bool value = []() constexpr {
-            bool valid = true;
-            valid &= std::is_same<typename T::template find_elem_t<
-                                          tune_key::data_type_a>::type,
-                    typename U::template find_elem_t<
-                            tune_key::data_type_a>::type>::value;
-            valid &= T::template find_elem_v<tune_key::
-                                     memory_layout_a> == U::template find_elem_v<tune_key::memory_layout_a>;
-            valid &= T::template find_elem_v<tune_key::
-                                     memory_alignment_a> == U::template find_elem_v<tune_key::memory_alignment_a>;
-            valid &= std::is_same<typename T::template find_elem_t<
-                                          tune_key::data_type_b>::type,
-                    typename U::template find_elem_t<
-                            tune_key::data_type_b>::type>::value;
-            valid &= T::template find_elem_v<tune_key::
-                                     memory_layout_b> == U::template find_elem_v<tune_key::memory_layout_b>;
-            valid &= T::template find_elem_v<tune_key::
-                                     memory_alignment_b> == U::template find_elem_v<tune_key::memory_alignment_b>;
-            valid &= std::is_same<typename T::template find_elem_t<
-                                          tune_key::data_type_c>::type,
-                    typename U::template find_elem_t<
-                            tune_key::data_type_c>::type>::value;
-            valid &= T::template find_elem_v<tune_key::
-                                     memory_layout_c> == U::template find_elem_v<tune_key::memory_layout_c>;
-            valid &= T::template find_elem_v<tune_key::
-                                     memory_alignment_c> == U::template find_elem_v<tune_key::memory_alignment_c>;
-            valid &= T::template find_elem_v<tune_key::
-                                     gpu_arch> == U::template find_elem_v<tune_key::gpu_arch>;
-            return valid;
-        }
-        ();
-    };
+    static constexpr bool valid_attribute_v // true iff T and U agree on:
+        = std::is_same_v<data_type_a_t<T>, data_type_a_t<U>>   // A data type
+        && memory_layout_a_v<T> == memory_layout_a_v<U>        // A layout
+        && memory_alignment_a_v<T> == memory_alignment_a_v<U>  // A alignment
+        && std::is_same_v<data_type_b_t<T>, data_type_b_t<U>>  // B data type
+        && memory_layout_b_v<T> == memory_layout_b_v<U>        // B layout
+        && memory_alignment_b_v<T> == memory_alignment_b_v<U>  // B alignment
+        && std::is_same_v<data_type_c_t<T>, data_type_c_t<U>>  // C data type
+        && memory_layout_c_v<T> == memory_layout_c_v<U>        // C layout
+        && memory_alignment_c_v<T> == memory_alignment_c_v<U>  // C alignment
+        && gpu_arch_v<T> == gpu_arch_v<U>; // GPU architecture
 };
 
 // parameter adaptor
diff --git a/include/kernel/default_config/decision_tree_policy.hpp b/include/kernel/default_config/decision_tree_policy.hpp
index c8b0b3c21..f9d89fbd5 100644
--- a/include/kernel/default_config/decision_tree_policy.hpp
+++ b/include/kernel/default_config/decision_tree_policy.hpp
@@ -264,53 +264,47 @@ struct kslicing_handler {
 };
 } // namespace decision_tree_rule
 
-template <typename dict_t_, typename opt_dict_t_>
+template <typename dict, typename opt_dict>
 struct fallback_optimizer {
-    using type = typename opt_dict_t_::template update_t<
-            elem_t_t<tune_key::data_type_a,
-                    typename dict_t_::template find_elem_t<
-                            tune_key::data_type_a>::type>,
-            elem_t_t<tune_key::data_type_b,
-                    typename dict_t_::template find_elem_t<
-                            tune_key::data_type_b>::type>,
-            elem_t_t<tune_key::data_type_c,
-                    typename dict_t_::template find_elem_t<
-                            tune_key::data_type_c>::type>,
-            elem_v_t<tune_key::memory_layout_a,
-                    dict_t_::template find_elem_v<tune_key::memory_layout_a>>,
-            elem_v_t<tune_key::memory_layout_b,
-                    dict_t_::template find_elem_v<tune_key::memory_layout_b>>,
-            elem_v_t<tune_key::memory_layout_c,
-                    dict_t_::template find_elem_v<tune_key::memory_layout_c>>,
-            elem_v_t<tune_key::memory_alignment_a,
-                    dict_t_::template find_elem_v<
-                            tune_key::memory_alignment_a>>,
-            elem_v_t<tune_key::memory_alignment_b,
-                    dict_t_::template find_elem_v<
-                            tune_key::memory_alignment_b>>,
-            elem_v_t<tune_key::memory_alignment_c,
-                    dict_t_::template find_elem_v<
-                            tune_key::memory_alignment_c>>,
-            elem_v_t<tune_key::gpu_arch,
-                    dict_t_::template find_elem_v<tune_key::gpu_arch>>>;
+    using type = typename opt_dict::template update_t<
+            elem_t_t<tune_key::data_type_a, data_type_a_t<dict>>,
+            elem_t_t<tune_key::data_type_b, data_type_b_t<dict>>,
+            elem_t_t<tune_key::data_type_c, data_type_c_t<dict>>,
+            elem_v_t<tune_key::memory_layout_a, memory_layout_a_v<dict>>,
+            elem_v_t<tune_key::memory_layout_b, memory_layout_b_v<dict>>,
+            elem_v_t<tune_key::memory_layout_c, memory_layout_c_v<dict>>,
+            elem_v_t<tune_key::memory_alignment_a, memory_alignment_a_v<dict>>,
+            elem_v_t<tune_key::memory_alignment_b, memory_alignment_b_v<dict>>,
+            elem_v_t<tune_key::memory_alignment_c, memory_alignment_c_v<dict>>,
+            elem_v_t<tune_key::gpu_arch, gpu_arch_v<dict>>>;
 };
 
-template <param_optimizer_tag tag_, typename dict_t_, typename... candidates_t>
+template <param_optimizer_tag tag_, typename dict_t_,
+        param_optimizer_mode mode_, typename... candidates_t>
 struct decision_tree_optimizer : param_optimizer_base {
     struct impl {
-        using type = typename dict_t_ ::template update_generator_t<
-                decision_tree_rule::data_type_handler>::
-                template update_generator_t<
-                        decision_tree_rule::tile_shape_handler>::
-                        template update_generator_t<
-                                decision_tree_rule::kslicing_handler>;
+        template <typename T, template <typename> typename G>
+        using apply_handler = typename T::template update_generator_t<G>;
+        static constexpr bool keep_shape
+                = (mode_ == param_optimizer_mode::keep_shape);
+        // Apply the rule handlers in order; keep_shape skips tile_shape_handler.
+        using t0 = dict_t_;
+        using t1 = apply_handler<t0, decision_tree_rule::data_type_handler>;
+        using t2_0 = apply_handler<t1, decision_tree_rule::tile_shape_handler>;
+        using t2 = std::conditional_t<keep_shape, t1, t2_0>;
+        using t3 = apply_handler<t2, decision_tree_rule::kslicing_handler>;
+
+        using type = t3;
+
+        // If a handler changed data_type / mem_layout / mem_align / gpu_arch,
+        // fallback_optimizer restores those entries from the original dict_t_.
         using fallback_type = fallback_optimizer<dict_t_, type>;
     };
     static constexpr bool use_fallback
-            = !(param_optimizer_base::template validate_attribute<dict_t_,
-                    typename impl::type>::value);
-    using type = typename std::conditional<use_fallback,
-            typename impl::fallback_type, impl>::type::type;
+            = !(param_optimizer_base::template valid_attribute_v<dict_t_,
+                    typename impl::type>);
+    using type = typename std::conditional_t<use_fallback,
+            typename impl::fallback_type, impl>::type;
 };
 
 } // namespace gpu::xetla
diff --git a/include/kernel/default_config/dummy_policy.hpp b/include/kernel/default_config/dummy_policy.hpp
index 7bed9f2c9..b18c882e7 100644
--- a/include/kernel/default_config/dummy_policy.hpp
+++ b/include/kernel/default_config/dummy_policy.hpp
@@ -255,8 +255,8 @@ struct dummy_optimizer : param_optimizer_base {
         using fallback_type = fallback_optimizer<dict_t_, type>;
     };
     static constexpr bool use_fallback
-            = !(param_optimizer_base::template validate_attribute<dict_t_,
-                    typename impl::type>::value);
+            = !(param_optimizer_base::template valid_attribute_v<dict_t_,
+                    typename impl::type>);
     using type = typename std::conditional<use_fallback,
             typename impl::fallback_type, impl>::type::type;
 };
diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index 502b28076..ffea4a0b4 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -73,8 +73,11 @@ struct param_optimizer<param_optimizer_tag::kernel, dict_t_> {
                                param_optimizer_type> != dict_t_::impl::key_not_found)
             && (dict_t_::template find_elem_v<tune_key::
                                 param_optimizer_type> == tune_key_value::param_optimizer_decision_tree);
+    static constexpr auto optimizer_mode
+            = dict_t_::template find_elem_v<tune_key::param_optimizer_mode>;
     using type = typename std::conditional<use_rule,
-            decision_tree_optimizer<param_optimizer_tag::kernel, dict_t_>,
+            decision_tree_optimizer<param_optimizer_tag::kernel, dict_t_,
+                    optimizer_mode>,
             dummy_optimizer<param_optimizer_tag::kernel, dict_t_,
                     kernel::param_kslicing_g1l1_t,
                     kernel::param_kslicing_g2l1_t,
@@ -199,8 +202,11 @@ struct param_optimizer<param_optimizer_tag::work_group, dict_t_> {
                                param_optimizer_type> != dict_t_::impl::key_not_found)
             && (dict_t_::template find_elem_v<tune_key::
                                 param_optimizer_type> == tune_key_value::param_optimizer_decision_tree);
+    static constexpr auto optimizer_mode
+            = dict_t_::template find_elem_v<tune_key::param_optimizer_mode>;
     using type = typename std::conditional<use_rule,
-            decision_tree_optimizer<param_optimizer_tag::work_group, dict_t_>,
+            decision_tree_optimizer<param_optimizer_tag::work_group, dict_t_,
+                    optimizer_mode>,
             dummy_optimizer<param_optimizer_tag::work_group, dict_t_,
                     group::param_dict1_wg_t>>::type::type;
 };
diff --git a/include/kernel/gemm/gemm_preset.hpp b/include/kernel/gemm/gemm_preset.hpp
index fde7f21ab..d62d8c2b0 100644
--- a/include/kernel/gemm/gemm_preset.hpp
+++ b/include/kernel/gemm/gemm_preset.hpp
@@ -71,7 +71,9 @@ using default_param_t = dict_t<>::template update_dict_t<
                 elem_t_t<tune_key::wg_tile_shape, shape<256, 256>>,
                 elem_t_t<tune_key::sg_tile_shape, shape<64, 32>>,
                 elem_v_t<tune_key::param_optimizer_type,
-                        tune_key_value::param_optimizer_dummy>>;
+                        tune_key_value::param_optimizer_dummy>,
+                elem_v_t<tune_key::param_optimizer_mode,
+                        param_optimizer_mode::full, param_optimizer_mode>>;
 
 namespace kernel {
 using param_kslicing_g1l1_t = default_param_t::template update_t<
diff --git a/include/subgroup/tile/impl/payload_xe.hpp b/include/subgroup/tile/impl/payload_xe.hpp
index 302167448..bd4934a1f 100644
--- a/include/subgroup/tile/impl/payload_xe.hpp
+++ b/include/subgroup/tile/impl/payload_xe.hpp
@@ -1308,7 +1308,7 @@ struct prefetch_payload_t<
         tile_desc_t<tile_size_x_, tile_size_y_, block_size_x_, block_size_y_,
                 reg_layout_>,
         num_coop_sg_, arch_tag_,
-        std::enable_if_t<(arch_tag_ == gpu_arch::Dg2
+        std::enable_if_t<(arch_tag_ <= gpu_arch::Dg2
                 && (tile_size_y_ != 1 || block_size_y_ != 1))>> {
     using dtype = dtype_;
     using mem_desc_t
diff --git a/tests/integration/gemm/fp16/common.hpp b/tests/integration/gemm/fp16/common.hpp
index 8a6ceaf16..10c859631 100644
--- a/tests/integration/gemm/fp16/common.hpp
+++ b/tests/integration/gemm/fp16/common.hpp
@@ -17,9 +17,9 @@
 #pragma once
 
 #include "kernel_func.hpp"
+#include <gtest/gtest.h>
 #include <utils/buff_compare.hpp>
 #include <utils/common.hpp>
-#include <gtest/gtest.h>
 
 class TestBase {
 public:
@@ -41,26 +41,6 @@ class TestBase {
     static constexpr mma_engine engine = mma_engine::xmx;
 };
 
-class Test : public TestBase {
-public:
-    static constexpr size_t mat_m = 256;
-    static constexpr size_t mat_n = 256;
-    static constexpr size_t mat_k = 256;
-    static constexpr size_t wg_m = 8;
-    static constexpr size_t wg_n = 32;
-    static constexpr size_t sg_m = 8;
-    static constexpr size_t sg_n = 16;
-    static constexpr size_t sg_k = 32;
-    static constexpr uint32_t global_kslicing = 1;
-    static constexpr uint32_t local_kslicing = 1;
-    static constexpr mem_layout layout_a = mem_layout::row_major;
-    static constexpr mem_layout layout_b = mem_layout::row_major;
-    using data_type_a = fp16;
-    using data_type_b = fp16;
-    using data_type_c = fp16;
-    using data_type_acc = float;
-};
-
 class Test0 : public TestBase {
 public:
     static constexpr size_t mat_m = 256;
diff --git a/tests/integration/gemm/fp16/kernel_func.hpp b/tests/integration/gemm/fp16/kernel_func.hpp
index adaef295d..98fdb9572 100644
--- a/tests/integration/gemm/fp16/kernel_func.hpp
+++ b/tests/integration/gemm/fp16/kernel_func.hpp
@@ -29,8 +29,8 @@ template <typename dtype_a, typename dtype_b, typename dtype_c,
         uint32_t global_kslicing, uint32_t local_kslicing, mma_engine engine>
 struct fp16_gemm_test_func {
     using tile_shape = tile_shape_t<wg_n, wg_m, sg_n, sg_m>;
-    static constexpr uint32_t periodic_sync_interval = 0;
-    static constexpr uint32_t prefetch_distance = 0;
+    static constexpr uint32_t periodic_sync_interval = 8;
+    static constexpr uint32_t prefetch_distance = 3;
 
     using compute_attr = typename std::conditional<(engine == mma_engine::fpu),
             compute_attr_t<dtype_acc, dtype_acc, dtype_acc>,
@@ -40,9 +40,9 @@ struct fp16_gemm_test_func {
     using compute_policy =
             typename std::conditional<(engine == mma_engine::fpu),
                     compute_policy_default_fpu<compute_attr, perf_tuning_knob,
-                            gpu_arch::Dg2>,
+                            gpu_arch::Xe>,
                     compute_policy_default_xmx<compute_attr, perf_tuning_knob,
-                            gpu_arch::Dg2>>::type;
+                            gpu_arch::Xe>>::type;
 
     using mem_desc_input_a = mem_desc_t<dtype_a, layout_a, mem_space::global>;
     using mem_desc_input_b = mem_desc_t<dtype_b, layout_b, mem_space::global>;
@@ -52,11 +52,11 @@ struct fp16_gemm_test_func {
     using gemm_t = gemm_t<compute_policy, tile_shape, mem_desc_input_a,
             mem_desc_input_b>;
 
-    using epilogue_t = epilogue_t<epilogue_policy_default<gpu_arch::Dg2>,
+    using epilogue_t = epilogue_t<epilogue_policy_default<gpu_arch::Xe>,
             tile_shape, mem_desc_output_c>;
 
     using group_swizzle
-            = gpu::xetla::kernel::group_swizzle_default<gpu_arch::Dg2>;
+            = gpu::xetla::kernel::group_swizzle_default<gpu_arch::Xe>;
 
     using dispatch_policy = dispatch_policy_kslicing<group_swizzle,
             global_kslicing, local_kslicing>;
diff --git a/tests/utils/execution.hpp b/tests/utils/execution.hpp
index 3d85114da..66519472e 100644
--- a/tests/utils/execution.hpp
+++ b/tests/utils/execution.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <iomanip>
 #include "common.hpp"
 #include "profiling.hpp"
 #include "xetla.hpp"
@@ -89,11 +90,13 @@ void gemm_exec(const std::string &compile_str, size_t batch = 1) {
         std::vector<kernel_id> kernelId = {get_kernel_id<Test>()};
         auto inputBundle
                 = get_kernel_bundle<bundle_state::input>(context, kernelId);
-        static const std::string env_set_str = "SYCL_PROGRAM_COMPILE_OPTIONS="+compile_str;
-        putenv(const_cast<char*>(env_set_str.c_str()));
+        static const std::string env_set_str
+                = "SYCL_PROGRAM_COMPILE_OPTIONS=" + compile_str;
+        putenv(const_cast<char *>(env_set_str.c_str()));
         kernel_bundle<bundle_state::executable> exeBundle = build(inputBundle);
-        static const std::string env_unset_str = "SYCL_PROGRAM_COMPILE_OPTIONS=";
-        putenv(const_cast<char*>(env_unset_str.c_str()));
+        static const std::string env_unset_str
+                = "SYCL_PROGRAM_COMPILE_OPTIONS=";
+        putenv(const_cast<char *>(env_unset_str.c_str()));
 
         using namespace gpu::xetla::group;
         using namespace gpu::xetla::kernel;
@@ -227,3 +230,95 @@ void kernel_run(auto nd_range, auto validate_result) {
     free(B_host);
     free(C_host);
 }
+
+/// Runs F<arch>::exec(args...) for the gpu_arch matching a default-constructed
+/// sycl::device: by its reported architecture when available, else by device ID.
+template <template <gpu_arch> class F>
+class dispatch_arch {
+    using T_RET = std::invoke_result_t<decltype(F<gpu_arch::Xe>::exec)>;
+
+public:
+    template <typename... Args>
+    static T_RET exec(Args &&...args) {
+        // save default formatting
+        std::ios fmt_bak(nullptr);
+        fmt_bak.copyfmt(std::cout);
+
+        sycl::device device;
+        if (!device.has(aspect::ext_intel_device_id)) {
+            std::cout << "Can not get device ID\n";
+            return;
+        }
+        auto deviceID = device.get_info<ext::intel::info::device::device_id>();
+        // setw(4) is what makes setfill('0') pad, so 0x0bda is not shown as 0xbda
+        std::cout << "deviceID: 0x" << std::hex << std::right //
+                  << std::setfill('0') << std::setw(4) << deviceID << "\n";
+        // restore default formatting
+        std::cout.copyfmt(fmt_bak);
+#if defined(SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE) \
+        && SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE
+        // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc#feature-test-macro
+        namespace ENS = sycl::ext::oneapi::experimental;
+        auto deviceArch = device.get_info<ENS::info::device::architecture>();
+        switch (deviceArch) {
+            case ENS::architecture::intel_gpu_pvc:
+                return F<gpu_arch::Xe>::exec(std::forward<Args>(args)...);
+            case ENS::architecture::intel_gpu_dg2_g10:
+            case ENS::architecture::intel_gpu_dg2_g11:
+            case ENS::architecture::intel_gpu_dg2_g12:
+                return F<gpu_arch::Dg2>::exec(std::forward<Args>(args)...);
+            default: break;
+        }
+        // unrecognized architecture: fall through to the device-ID table below
+#endif
+        std::cout << "No maching architecture, checking device ID ...\n";
+        switch (deviceID) {
+            // DG2 devices: https://gfxspecs.intel.com/Predator/Home/Index/44477
+            case 0x56a0: // Intel® Arc ™ A770 Graphics
+            case 0x56a1: // Intel® Arc ™ A750 Graphics
+            case 0x56a2: // Intel® Arc ™ A580 Graphics
+            case 0x5690: // Intel® Arc ™ A770M Graphics
+            case 0x5691: // Intel® Arc ™ A730M Graphics
+            case 0x5692: // Intel® Arc ™ A550M Graphics
+                return F<gpu_arch::Dg2>::exec(std::forward<Args>(args)...);
+            // PVC devices: https://gfxspecs.intel.com/Predator/Home/Index/44484
+            case 0x0bda: //
+                return F<gpu_arch::Xe>::exec(std::forward<Args>(args)...);
+            default: std::cout << "Unknown device ID \n"; return;
+        }
+    }
+};
+
+/// Prints the name and selected capability limits of device @p d to std::cout.
+/// inline: defined in a header, so several TUs may include it without ODR clash.
+inline void print_device_details(const sycl::device &d) {
+    std::cout << "Running on " << d.get_info<info::device::name>() << "\n";
+    std::cout << "  max_compute_units: "
+              << d.get_info<info::device::max_compute_units>() << "\n";
+    std::cout << "  max_work_group_size: "
+              << d.get_info<info::device::max_work_group_size>() << "\n";
+    std::cout << "  max_num_sub_groups: "
+              << d.get_info<info::device::max_num_sub_groups>() << "\n";
+    std::cout << "  global_mem_size: "
+              << d.get_info<info::device::global_mem_size>() << "\n";
+    std::cout << "  local_mem_size: "
+              << d.get_info<info::device::local_mem_size>() << "\n";
+    const auto wi_sizes = d.get_info<info::device::max_work_item_sizes<3>>();
+    std::cout << "  max_work_item_sizes: " << wi_sizes[0] << " "
+              << wi_sizes[1] << " " << wi_sizes[2] << "\n";
+    std::cout << "  sub_group_sizes:";
+    for (const auto sg_size : d.get_info<info::device::sub_group_sizes>()) {
+        std::cout << " " << sg_size;
+    }
+    std::cout << "\n";
+    if (d.has(aspect::ext_intel_gpu_subslices_per_slice)) {
+        auto subslices = d.get_info<
+                ext::intel::info::device::gpu_subslices_per_slice>();
+        std::cout << "  gpu_subslices_per_slice: " << subslices << "\n";
+    }
+    if (d.has(aspect::ext_intel_gpu_eu_count_per_subslice)) {
+        auto euCount = d.get_info<
+                ext::intel::info::device::gpu_eu_count_per_subslice>();
+        std::cout << "  gpu_eu_count_per_subslice: " << euCount << "\n";
+    }
+}

From 06a919b0f739cd8aab1ae1ef0f7a02206f28145d Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Thu, 7 Mar 2024 18:34:43 +0000
Subject: [PATCH 06/11] example 01 && example 02 && typos

---
 CMakeLists.txt                                |  2 +-
 examples/01_gemm_universal/gemm_universal.cpp | 83 ++++++++++---------
 examples/02_basic_gemm/basic_gemm.cpp         | 60 +++++++-------
 examples/03_gemm_relu_bias/gemm_relu_bias.cpp | 10 +--
 .../04_gemm_polynomial/gemm_polynomial.cpp    |  6 +-
 examples/05_batch_gemm/batch_gemm.cpp         |  4 +-
 examples/05_batch_gemm/batch_gemm.hpp         |  4 +-
 examples/06_gemm_softmax/gemm_softmax.cpp     | 16 ++--
 .../multi_layer_perceptron.cpp                |  8 +-
 .../scaled_dot_product_attention.cpp          | 55 ++++++------
 examples/10_gemm_large_n/gemm_large_n.cpp     |  6 +-
 examples/11_stream_k_gemm/stream_k_gemm.cpp   | 10 +--
 include/common/core/arch_config.hpp           |  2 +
 .../gemm/impl/int4_dequantize_kslicing_xe.hpp |  9 +-
 include/kernel/gemm/default_gemm.hpp          | 52 +++++++-----
 include/kernel/gemm/gemm_preset.hpp           | 25 +++---
 include/kernel/gemm/impl/default_xe.hpp       |  4 +-
 include/kernel/gemm/impl/kslicing_xe.hpp      |  4 +-
 include/kernel/gemm/impl/stream_k_xe.hpp      |  4 +-
 media/docs/construct_a_gemm.md                | 16 ++--
 .../default_config/group_gemm/kernel_func.hpp |  6 +-
 .../kernel_gemm/kernel_func.hpp               |  2 +-
 tests/integration/gemm/bf16_stream_k/main.cpp | 16 ++--
 tests/utils/execution.hpp                     | 28 +++++--
 24 files changed, 227 insertions(+), 205 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d305e5a65..8a6979374 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,7 +46,7 @@ endif ()
 add_compile_options(-fsycl)
 add_link_options(-fsycl)
 if(UNIX)
-    add_compile_options(-fp-model=precise -Wall -Wextra -Werror)
+    add_compile_options(-fp-model=precise -Wall -Wextra )
     add_link_options(-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm)
     link_libraries(-lgtest -lgtest_main)
 else() # Windows
diff --git a/examples/01_gemm_universal/gemm_universal.cpp b/examples/01_gemm_universal/gemm_universal.cpp
index 5144c2e16..a895c303a 100644
--- a/examples/01_gemm_universal/gemm_universal.cpp
+++ b/examples/01_gemm_universal/gemm_universal.cpp
@@ -18,7 +18,8 @@
 
 enum class kslicing_impl_t : uint8_t { none = 0, global = 1, local = 2 };
 
-template <kslicing_impl_t kslicing_type = kslicing_impl_t::none>
+template <gpu_arch arch_tag,
+        kslicing_impl_t kslicing_type = kslicing_impl_t::none>
 void gemm_universal_run(uint32_t iter) {
     // Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
     // Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
@@ -82,7 +83,7 @@ void gemm_universal_run(uint32_t iter) {
     constexpr uint32_t num_local_splitk
             = (kslicing_type == kslicing_impl_t::local) ? 2 : 1;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using tune_option = dict_t<
             elem_v_t<tune_key::param_optimizer_type,
                     tune_key_value::param_optimizer_decision_tree>,
@@ -91,11 +92,7 @@ void gemm_universal_run(uint32_t iter) {
                     tune_key_value::dispatch_policy_kslicing>,
             elem_v_t<tune_key::global_kslicing_ratio, num_global_splitk>,
             elem_v_t<tune_key::local_kslicing_ratio, num_local_splitk>,
-            elem_t_t<tune_key::wg_tile_shape, shape<wg_tile_n, wg_tile_m>>,
-            elem_t_t<tune_key::group_swizzle_policy,
-                    gpu::xetla::kernel::group_swizzle_default<gpu_arch::Dg2>>,
-            elem_t_t<tune_key::epilogue_policy,
-                    gpu::xetla::group::epilogue_policy_default<gpu_arch::Dg2>>>;
+            elem_t_t<tune_key::wg_tile_shape, shape<wg_tile_n, wg_tile_m>>>;
     using gemm_op_t = gpu::xetla::kernel::default_gemm_t<
             data_type_a, // input datatype for A
             mem_layout::row_major, // memory layout for A
@@ -106,8 +103,8 @@ void gemm_universal_run(uint32_t iter) {
             data_type_c, // output datatype for C
             mem_layout::row_major, // memory layout for C
             8, // leading dimension alignment for C, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
-            gpu_arch::Dg2, // GPU arch
+            data_type_acc, // accumulator data type for intermediate results
+            arch_tag, // GPU arch
             tune_option>;
 
     // allocate temp buffers for global split
@@ -188,36 +185,42 @@ void gemm_universal_run(uint32_t iter) {
     free(Cnt, context);
 }
 
+/// Per-architecture entry point of this example: dispatch_arch<main_wrapper>
+/// invokes `exec` with arch_tag set to the gpu_arch it selects.
+template <gpu_arch arch_tag>
+struct main_wrapper {
+    static constexpr auto exec = []() {
+        // An example code for calculating matrix multiplication using
+        // GEMM_UNIVERSAL API:
+        //   C = A x B
+        // The resulting matrix C is partitioned by the group range into multiple
+        // blocks. The block matrix C<i_w, j_w> is computed by the workgroup with
+        // id (0, i_w, j_w), where (i_w, j_w) is an element of the group range.
+        // Each thread with index (0, i_s, j_s) inside the same workgroup
+        // is responsible for a sub block of matrix multiplication, which is
+        //   C<i_w, j_w>[i_s*sg_m:(i_s+1)*sg_m,j_s*sg_n:(j_s+1)*sg_n]
+
+        // Alternatively, some threads can cooperate on the same sub block
+        // matrix given the same (i_s, j_s), i.e. the index space is extended
+        // from (0, i_s, j_s) to (k_s, i_s, j_s).
+
+        // Another method to achieve the same effect is to extend the index space
+        // in group range, i.e. from (0, i_w, j_w) to (k_w, i_w, j_w)
+
+        // More detailed description referring to the cooperation (kslicing) could
+        // be found in the example 01_gemm_universal with custom implementation
+
+        // basic gemm_universal
+        gemm_universal_run<arch_tag, kslicing_impl_t::none>(10);
+
+        // basic gemm_universal with workgroup cooperation
+        // gemm_universal_run<arch_tag, kslicing_impl_t::global>(10);
+
+        // basic gemm_universal with thread cooperation
+        // gemm_universal_run<arch_tag, kslicing_impl_t::local>(10);
+    };
+};
 int main() {
-    // An example code for calculating matrix multiplication using
-    // GEMM_UNIVERSAL API:
-    //   C = A x B
-    // The resulted matrix C is partitioned by the group range
-    // in to multiple blocks. The block matrix
-    //  C<i_w, j_w>
-    // is computed by the workgroup with id: (0, i_w, j_w).
-    // (i_w, j_w) is an element in range specified by group range.
-    // Each thread with index (0, i_s, j_s) inside the same workgroup
-    // is responsible for a sub block of matrix multiplication, which is
-    //   C<i_w, j_w>[i_s*sg_m:(i_s+1):sg_m,j_s*sg_n:(j_s+1)*sg_n]
-
-    // Alternatively, some threads can cooperate on the same sub block
-    // matrix given the same (i_s, j_s), i.e. the index space is extended
-    // from (0, i_s, j_s) to (k_s, i_s, j_s).
-
-    // Another method to achieve the same effect is to extend the index space
-    // in group range, i.e. from (0, i_w, j_w) to (k_w, i_w, j_w)
-
-    // More detailed description referring to the cooperation (kslicing) could
-    // be found in the example 01_gemm_universal with custom implementation
-
-    // basic gemm_universal
-    gemm_universal_run<kslicing_impl_t::none>(10);
-
-    // basic gemm_universal with workgroup cooperation
-    // gemm_universal_run<kslicing_impl_t::global>(10);
-
-    // basic gemm_universal with thread cooperation
-    // gemm_universal_run<kslicing_impl_t::local>(10);
-    return (0);
+    dispatch_arch<main_wrapper>::exec();
+    return 0;
 }
diff --git a/examples/02_basic_gemm/basic_gemm.cpp b/examples/02_basic_gemm/basic_gemm.cpp
index 330e85e09..44866e82f 100644
--- a/examples/02_basic_gemm/basic_gemm.cpp
+++ b/examples/02_basic_gemm/basic_gemm.cpp
@@ -13,10 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *******************************************************************************/
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
-template <gpu_arch arch_tag_>
+template <gpu_arch arch_tag>
 void basic_gemm_run(sycl::queue queue, uint32_t iter) {
     // Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
     // Please make sure you fully understand these configurations before you do any modifications, incomplete changes may lead to unexpected behaviors.
@@ -110,11 +110,11 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                 // should larger than 8
                 static constexpr uint32_t k_stride = 32;
 
-                // Step 1: define mirco-kernel's configuration
+                // Step 1: define micro-kernel's configuration
                 using wg_shape = shape<wg_tile_n, wg_tile_m>;
                 using sg_shape = shape<sg_tile_n, sg_tile_m>;
 
-                // Mirco-kernel configuration
+                // Micro-kernel configuration
                 using gemm_tune_option
                         = dict_t<elem_t_t<tune_key::sg_tile_shape, sg_shape>,
                                 elem_v_t<tune_key::prefetch_distance,
@@ -132,10 +132,10 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                         8, // leading dimension for B, in unit of element
                         mem_space::
                                 global, // memory reading from global mem for B
-                        data_type_acc, // accumulator data type for intermediate resutls
+                        data_type_acc, // accumulator data type for intermediate results
                         wg_shape, // computation tile shape
                         k_stride, // elements in each iteration
-                        arch_tag_, // GPU arch
+                        arch_tag, // GPU arch
                         gemm_tune_option>;
                 gemm_t gemm;
 
@@ -149,24 +149,26 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                         mem_space::global, // memory writing to global mem for C
                         wg_shape, // computation tile shape
                         k_stride, // elements in each iteration
-                        arch_tag_, // GPU arch
+                        arch_tag, // GPU arch
                         epilogue_tune_option>;
 
                 // Step 3: define the shared local memory usages
                 // developers have the responsibility to set
-                // shared loacal memory through XeTLA API
+                // shared local memory through XeTLA API
                 static constexpr uint32_t barrier_count = gemm_t::barrier_count;
                 static constexpr uint32_t slm_size = gemm_t::slm_size;
+                static_assert(slm_size <= arch_attr_t<arch_tag>::local_mem_size,
+                        "The local memory size excess!");
                 xetla_nbarrier_init<barrier_count>();
                 xetla_local_init<slm_size>();
 
-                // Step 4: ecah workgroup gets it individual index to start computation
+                // Step 4: each workgroup gets its individual index to start computation
                 int start_n = item.get_group(2) * wg_tile_n;
                 int start_m = item.get_group(1) * wg_tile_m;
                 // no slicing in K direction so start from zero for all WG
                 int start_k = 0;
 
-                // Each workgroup will compute all data in K based on no k_sliciing
+                // Each workgroup will compute all data in K based on no k_slicing
                 // The developer can set how much data a subgroup compute by k_stride
                 uint32_t wg_tile_k = matrix_k;
                 uint32_t inner_loop_count
@@ -183,7 +185,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                 mem_desc_output_c md_c(
                         {C}, {matrix_n, matrix_m, ldc}, {start_n, start_m});
 
-                // Step 6: real calculation with accumulator varibales which suppose
+                // Step 6: real calculation with accumulator variables which suppose
                 // will be in register.
                 typename gemm_t::matAcc_t matAcc;
                 matAcc.init(0);
@@ -194,7 +196,7 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
                 // the results is in the matAcc rather than real output C
                 typename gemm_t::work_group_t g(item.get_local_linear_id());
                 gemm(g, matAcc, gemm_args);
-                // Step 7: write the results from matACC to real output C
+                // Step 7: write the results from matAcc to real output C
                 epilogue_t epilogue;
                 epilogue(g, matAcc, md_c);
             });
@@ -219,23 +221,21 @@ void basic_gemm_run(sycl::queue queue, uint32_t iter) {
     free(C, context);
 }
 
+/// Per-architecture entry point of this example: dispatch_arch<main_wrapper>
+/// invokes `exec` with arch_tag set to the gpu_arch it selects.
+template <gpu_arch arch_tag>
+struct main_wrapper {
+    static constexpr auto exec = []() {
+        // Solves a standard GEMM with the batch-reduce (br) GEMM microkernel.
+        // The queue enables profiling to facilitate subsequent profiling.
+        const sycl::property_list queue_props {
+                sycl::property::queue::enable_profiling()};
+        sycl::queue gemm_queue(queue_props);
+        // 10 is the `iter` argument of basic_gemm_run
+        basic_gemm_run<arch_tag>(gemm_queue, 10);
+    };
+};
 int main() {
-    // This case shows how to use batch-reduce (br) GEMM microkernel to
-    // solve a standard GEMM
-    // Turn on the profiling property to facilitate subsequent profiling
-    sycl::property_list properties {sycl::property::queue::enable_profiling()};
-
-    // Define SYCL queue, context and device
-    auto queue = sycl::queue(properties);
-    auto device = queue.get_device();
-
-    // Detect the execution size, 8 for Arc, 16 for PVC.
-    int ExecSize
-            = device.get_info<ext::intel::info::device::gpu_eu_simd_width>();
-    if (ExecSize == 8) {
-        basic_gemm_run<gpu_arch::Dg2>(queue, 10);
-    } else {
-        basic_gemm_run<gpu_arch::Xe>(queue, 10);
-    }
-    return (0);
+    dispatch_arch<main_wrapper>::exec();
+    return 0;
 }
diff --git a/examples/03_gemm_relu_bias/gemm_relu_bias.cpp b/examples/03_gemm_relu_bias/gemm_relu_bias.cpp
index f81e946b0..caa820b50 100644
--- a/examples/03_gemm_relu_bias/gemm_relu_bias.cpp
+++ b/examples/03_gemm_relu_bias/gemm_relu_bias.cpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 #include <algorithm>
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
 using namespace cl::sycl;
 using namespace gpu::xetla;
@@ -140,7 +140,7 @@ void gemm_relu_bias_run(uint32_t iter) {
     using epilogue_policy
             = xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using tune_option = dict_t<
             elem_v_t<tune_key::param_optimizer_type,
                     tune_key_value::param_optimizer_decision_tree>,
@@ -156,7 +156,7 @@ void gemm_relu_bias_run(uint32_t iter) {
             data_type_c, // output datatype for C
             mem_layout::row_major, // memory layout for C
             8, // leading dimension alignment for C, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             gpu_arch::Xe, // GPU arch
             tune_option>;
     using gemm_op_t = typename default_config_t::type;
@@ -223,7 +223,7 @@ int main() {
     // The purpose of this example is to illustrate the epilogue_t API in XeTLA.
 
     // It allows user to implement multiple Ops inside a kernel call to avoid
-    // overheads in invokation, memory transfer, etc.
+    // overheads in invocation, memory transfer, etc.
     // Take the following python code as an example:
 
     // Original:
@@ -231,7 +231,7 @@ int main() {
     // > x = to.matmul(A, B)
     // > y = to.nn.functional.relu(x)
 
-    // It takes two kernel invokations and the ReLU Op is a elementwise operation
+    // It takes two kernel invocations and the ReLU Op is a elementwise operation
     // that could be fused into MatMul Op, which is basically calling GEMM kernel.
 
     // Fusion:
diff --git a/examples/04_gemm_polynomial/gemm_polynomial.cpp b/examples/04_gemm_polynomial/gemm_polynomial.cpp
index 2aa2b61a9..981561e66 100644
--- a/examples/04_gemm_polynomial/gemm_polynomial.cpp
+++ b/examples/04_gemm_polynomial/gemm_polynomial.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  *******************************************************************************/
 #include <algorithm>
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
 #include "gemm_polynomial.hpp"
 
@@ -137,7 +137,7 @@ void gemm_polynomial_run(int iter) {
     using epilogue_policy
             = xetla::group::epilogue_policy_tile_op<tile_op_t, gpu_arch::Xe>;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using tune_option = dict_t<
             elem_v_t<tune_key::param_optimizer_type,
                     tune_key_value::param_optimizer_decision_tree>,
@@ -154,7 +154,7 @@ void gemm_polynomial_run(int iter) {
             data_type_c, // output datatype for C
             mem_layout::row_major, // memory layout for C
             8, // leading dimension alignment for C, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             gpu_arch::Xe, // GPU arch
             tune_option>;
 
diff --git a/examples/05_batch_gemm/batch_gemm.cpp b/examples/05_batch_gemm/batch_gemm.cpp
index 3011f26a1..ff66838c1 100644
--- a/examples/05_batch_gemm/batch_gemm.cpp
+++ b/examples/05_batch_gemm/batch_gemm.cpp
@@ -90,7 +90,7 @@ void batch_gemm_run(uint32_t iter) {
     using wg_shape = shape<wg_tile_n, wg_tile_m>;
     using sg_shape = shape<sg_tile_n, sg_tile_m>;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using tune_option
             = dict_t<elem_v_t<tune_key::param_optimizer_type,
                              tune_key_value::param_optimizer_decision_tree>,
@@ -106,7 +106,7 @@ void batch_gemm_run(uint32_t iter) {
             mem_layout::row_major, // memory layout for B
             8, // leading dimension for B, in unit of element
             mem_space::global, // memory reading from global mem for B
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             wg_shape, // computation tile shape
             wg_tile_k, // elements in each iteration
             gpu_arch::Xe, // GPU arch
diff --git a/examples/05_batch_gemm/batch_gemm.hpp b/examples/05_batch_gemm/batch_gemm.hpp
index ce2a814d7..fe00dbaa5 100644
--- a/examples/05_batch_gemm/batch_gemm.hpp
+++ b/examples/05_batch_gemm/batch_gemm.hpp
@@ -173,8 +173,8 @@ class batch_gemm_t {
     /// @return The size of local memory required.
     __XETLA_API static constexpr uint32_t get_slm_size() {
         constexpr uint32_t size = gemm_t::slm_size + epilogue_t::slm_size;
-        static_assert(size <= (128 * 1024),
-                "The local memory size should be less than 128KB!");
+        static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
+                "The local memory size excess!");
         return size;
     };
 
diff --git a/examples/06_gemm_softmax/gemm_softmax.cpp b/examples/06_gemm_softmax/gemm_softmax.cpp
index 724f43a57..cb6562677 100644
--- a/examples/06_gemm_softmax/gemm_softmax.cpp
+++ b/examples/06_gemm_softmax/gemm_softmax.cpp
@@ -14,8 +14,8 @@
  * limitations under the License.
  *******************************************************************************/
 
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
 using namespace gpu::xetla;
 using namespace cl::sycl;
@@ -156,8 +156,8 @@ void gemm_softmax_run(uint32_t iter) {
     cl::sycl::nd_range<3> nd_range(group_range * local_range, local_range);
 
     uint32_t warmup = 10;
-    int64_t ops
-            = 2 * static_cast<int64_t>(matrix_m) * matrix_n * matrix_k * batch_num;
+    int64_t ops = 2 * static_cast<int64_t>(matrix_m) * matrix_n * matrix_k
+            * batch_num;
     profiling_helper prof("gemm_softmax", ops, "gflops");
     try {
         for (uint32_t i = 0; i < iter + warmup; i++) {
@@ -178,11 +178,11 @@ void gemm_softmax_run(uint32_t iter) {
                     // should larger than 8
                     static constexpr uint32_t k_iter_num = 16;
 
-                    // Step 1: define mirco-kernel's configuration
+                    // Step 1: define micro-kernel's configuration
                     using wg_shape = shape<wg_tile_n, wg_tile_m>;
                     using sg_shape = shape<sg_tile_n, sg_tile_m>;
 
-                    // Mirco-kernel configuration
+                    // Micro-kernel configuration
                     using tune_option = dict_t<
                             elem_v_t<tune_key::param_optimizer_type,
                                     tune_key_value::
@@ -203,7 +203,7 @@ void gemm_softmax_run(uint32_t iter) {
                             8, // leading dimension for B, in unit of element
                             mem_space::
                                     global, // memory reading from global mem for B
-                            data_type_sfx, // accumulator data type for intermediate resutls
+                            data_type_sfx, // accumulator data type for intermediate results
                             wg_shape, // computation tile shape
                             k_iter_num, // elements in each iteration
                             gpu_arch::Xe, // GPU arch
@@ -326,12 +326,12 @@ int main() {
     // Softmax needs entire row data for reduced sum and reduced max,
     // So result of batch-GeMM will be written into SLM.
     // When all thread in a work group finishing their job softmax start.
-    // To simlify the calculation of softmax, we make each single thread
+    // To simplify the calculation of softmax, we make each single thread
     // load entire one row data so that there's no data sharing
     // necessity among threads.
 
     // Description:
-    // This kernel can be descripted as following
+    // This kernel can be described as following
     // mathematical expression:
     //   C = softmax(A · B.transpose(-1, -2))
     // where:
diff --git a/examples/07_multi_layer_perceptron/multi_layer_perceptron.cpp b/examples/07_multi_layer_perceptron/multi_layer_perceptron.cpp
index 25a215075..959947875 100644
--- a/examples/07_multi_layer_perceptron/multi_layer_perceptron.cpp
+++ b/examples/07_multi_layer_perceptron/multi_layer_perceptron.cpp
@@ -164,7 +164,7 @@ void mlp_run(uint32_t iter) {
     using wg_shape_layer1 = shape<wg_tile_n_layer1, wg_tile_m_layer1>;
     using sg_shape_layer1 = shape<sg_tile_n_layer1, sg_tile_m_layer1>;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using epilogue_policy_layer1 = xetla::group::epilogue_policy_tile_op<
             xetla::subgroup::chained_tile_op_t<gpu::xetla::subgroup::relu_op_t>,
             gpu_arch::Xe>;
@@ -184,7 +184,7 @@ void mlp_run(uint32_t iter) {
             mem_layout::row_major, // memory layout for W
             8, // leading dimension for W, in unit of element
             mem_space::global, // memory reading from global mem for W
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             wg_shape_layer1, // computation tile shape
             wg_tile_k, // elements in each iteration
             gpu_arch::Xe, // GPU arch
@@ -203,7 +203,7 @@ void mlp_run(uint32_t iter) {
     using wg_shape_layer2 = shape<wg_tile_n_layer2, wg_tile_m_layer2>;
     using sg_shape_layer2 = shape<sg_tile_n_layer2, sg_tile_m_layer2>;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using layer2_tune_option
             = dict_t<elem_v_t<tune_key::param_optimizer_type,
                              tune_key_value::param_optimizer_decision_tree>,
@@ -219,7 +219,7 @@ void mlp_run(uint32_t iter) {
             mem_layout::row_major, // memory layout for V
             8, // leading dimension for V, in unit of element
             mem_space::global, // memory reading from global mem for V
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             wg_shape_layer2, // computation tile shape
             wg_tile_k, // elements in each iteration
             gpu_arch::Xe, // GPU arch
diff --git a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
index 420a854c4..de752a855 100644
--- a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
+++ b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
@@ -228,8 +228,8 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
 
     constexpr uint32_t slm_size
             = wg_tile_m_qk * wg_tile_n_qk * sizeof(dtype_sfx);
-    XETLA_ASSERT(slm_size <= device.get_info<info::device::local_mem_size>(),
-            "SLM size too large!");
+    static_assert(slm_size <= arch_attr_t<arch_tag>::local_mem_size,
+            "The local memory size excess!");
 
     static_assert(subgroup_range_m * subgroup_range_n == thread_num,
             "Given thread number should equal to pre-set value 32!");
@@ -512,38 +512,33 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
 template <gpu_arch arch_tag>
 struct main_wrapper {
     static constexpr auto exec = []() {
-        if constexpr (arch_tag == gpu_arch::Dg2) {
-            sdp_fwd_run<arch_tag>(10);
-        } else {
-            sdp_fwd_run<arch_tag>(10);
-        }
+        // This example implements scaled dot-product attention with batch_size: 16,
+        // num_heads: 16, sequence_length: 512, head_size: 64. It shows how to
+        // remap the index space of each work-item used for gemm1, softmax and gemm2.
+
+        // Description:
+        // The scaled dot-product mechanism can be seen as two chained batch MatMuls
+        // with a softmax in between. It can be described by the following
+        // mathematical expression:
+        //   softmax(Q · (K.transpose(-1, -2)) * (1 / sqr_root(num_heads)) +
+        //   attn_mask) · V
+        // where:
+        //   Q, K, V: input data
+        //   shape(Q) = [16 x 16, 512, 64]
+        //   shape(K) = [16 x 16, 512, 64]
+        //   shape(V) = [16 x 16, 512, 64]
+        //   shape(attn_mask) = [16, 512, 512]
+        //   shape(DST) = [16, 512, 16, 64]
+
+        // This kernel is designed to execute the following task:
+        // 1: S = (Q · (K.transpose(-1, -2))) * (1 / sqr_root(num_heads)) + attn_mask
+        // 2: S' = softmax(S)
+        // 3: O = S' · V
+        sdp_fwd_run<arch_tag>(10);
     };
 };
 
 int main() {
-    // This example implements scaled-dot-production with batch_size: 16,
-    // num_heads: 16, sequence_length: 512, head_size: 64. It will be shown how to
-    // remap the index space of each work-item used for gemm1, softmax and gemm2.
-
-    // Description:
-    // Scaled-dot-production mechanism can be seen as two chained batch MatMul
-    // with a softmax in the middle layer. It can be described as following
-    // mathematical expression:
-    //   softmax(Q · (K.transpose(-1, -2)) * (1 / sqr_root(num_heads)) +
-    //   attn_mask) · V
-    // where:
-    //   Q, K, V: input data
-    //   shape(Q) = [16 x 16, 512, 64]
-    //   shape(K) = [16 x 16, 512, 64]
-    //   shape(V) = [16 x 16, 512, 64]
-    //   shape(attn_mask) = [16, 512, 512]
-    //   shape(DST) = [16, 512, 16, 64]
-
-    // This kernel is designed to execute the following task:
-    // 1: S = (Q · (K.transpose(-1, -2))) * (1 / sqr_root(num_heads)) + attn_mask
-    // 2: S' = softmax(S)
-    // 3: O = S' · V
-
     dispatch_arch<main_wrapper>::exec();
     return 0;
 }
diff --git a/examples/10_gemm_large_n/gemm_large_n.cpp b/examples/10_gemm_large_n/gemm_large_n.cpp
index a0e0b599c..8ebe93d31 100644
--- a/examples/10_gemm_large_n/gemm_large_n.cpp
+++ b/examples/10_gemm_large_n/gemm_large_n.cpp
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
 #include <chrono>
 #include <thread>
@@ -83,7 +83,7 @@ void gemm_large_n_run(uint32_t iter) {
     // default 8
     static constexpr uint32_t wg_num_n = 8;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using group_swizzle
             = xetla::kernel::group_swizzle_snake<wg_num_n, gpu_arch::Xe>;
 
@@ -106,7 +106,7 @@ void gemm_large_n_run(uint32_t iter) {
             data_type_c, // output datatype for C
             mem_layout::row_major, // memory layout for C
             8, // leading dimension alignment for C, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             gpu_arch::Xe, // GPU arch
             tune_option>;
 
diff --git a/examples/11_stream_k_gemm/stream_k_gemm.cpp b/examples/11_stream_k_gemm/stream_k_gemm.cpp
index 87bd3fe93..0844dbe3c 100644
--- a/examples/11_stream_k_gemm/stream_k_gemm.cpp
+++ b/examples/11_stream_k_gemm/stream_k_gemm.cpp
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#include <tests/utils/utils.hpp>
 #include "xetla.hpp"
+#include <tests/utils/utils.hpp>
 
 void stream_k_gemm_run(uint32_t iter) {
     // Tips, the example demonstrates programming kernel with XeTLA, it works as expected with current configurations.
@@ -85,7 +85,7 @@ void stream_k_gemm_run(uint32_t iter) {
                     sg_tile_n, //	subgroup size in dim0
                     sg_tile_m>; //	subgroup size in dim1
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using gemm_config = xetla::group::gemm_selector_t<
             data_type_a, // input datatype for A
             data_type_b, // input datatype for B
@@ -95,7 +95,7 @@ void stream_k_gemm_run(uint32_t iter) {
             mem_space::global, // memory reading from global mem for B
             8, // leading dimension for A, in unit of element
             8, // leading dimension for B, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             tile_shape, // computation tile shape
             sg_tile_k, // elements in each iteration
             mma_engine::xmx, // compute engine
@@ -303,7 +303,7 @@ void stream_k_gemm_relu_biasadd_run(uint32_t iter) {
                     sg_tile_n, //	subgroup size in dim0
                     sg_tile_m>; //	subgroup size in dim1
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using gemm_config = xetla::group::gemm_selector_t<
             data_type_a, // input datatype for A
             data_type_b, // input datatype for B
@@ -313,7 +313,7 @@ void stream_k_gemm_relu_biasadd_run(uint32_t iter) {
             mem_space::global, // memory reading from global mem for B
             8, // leading dimension for A, in unit of element
             8, // leading dimension for B, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             tile_shape, // computation tile shape
             sg_tile_k, // elements in each iteration
             mma_engine::xmx, // compute engine
diff --git a/include/common/core/arch_config.hpp b/include/common/core/arch_config.hpp
index 8c020641d..c7f6157d8 100644
--- a/include/common/core/arch_config.hpp
+++ b/include/common/core/arch_config.hpp
@@ -120,6 +120,7 @@ struct arch_attr_t<gpu_arch::Xe> {
     using mma_attr = mma_attr_t<gpu_arch::Xe>;
 
     static constexpr uint32_t max_wg_num = 64;
+    static constexpr uint32_t local_mem_size = 128 * 1024;
 };
 
 template <>
@@ -133,6 +134,7 @@ struct arch_attr_t<gpu_arch::Dg2> {
     using mma_attr = mma_attr_t<gpu_arch::Dg2>;
 
     static constexpr uint32_t max_wg_num = 64;
+    static constexpr uint32_t local_mem_size = 64 * 1024;
 };
 
 /// @} xetla_core_arch_config
diff --git a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
index 2df93ab4e..4fb330e5e 100644
--- a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
+++ b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
@@ -376,13 +376,8 @@ class gemm_universal_t<dispatch_policy_int4_dequantize_kslicing<group_swizzle_,
     __XETLA_API static constexpr uint32_t get_slm_size() {
         constexpr uint32_t size = gemm_slm_size * num_local_kslicing
                 + kslicing_slm_size + epilogue_slm_size * num_local_kslicing;
-        if constexpr (arch_tag == gpu_arch::Dg2) {
-            static_assert(size <= (64 * 1024),
-                    "The local memory size should be less than 64KB!");
-        } else {
-            static_assert(size <= (128 * 1024),
-                    "The local memory size should be less than 128KB!");
-        }
+        static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
+                "The local memory size excess!");
         return size;
     }
 
diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index ffea4a0b4..6adbb264c 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -29,12 +29,12 @@ namespace kernel {
 template <typename dtype_a, mem_layout mem_layout_a, uint32_t alignment_a,
         typename dtype_b, mem_layout mem_layout_b, uint32_t alignment_b,
         typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
-        typename dtype_acc, gpu_arch gpu_arch_tag = gpu_arch::Xe,
+        typename dtype_acc, gpu_arch arch_tag = gpu_arch::Xe,
         typename tune_option = dict_t<>>
 struct default_gemm_config_t
     : param_adaptor<param_adaptor_tag::kernel,
               typename param_optimizer<param_optimizer_tag::kernel,
-                      typename default_param_t::template update_dict_t<
+                      typename default_param_t<arch_tag>::template update_dict_t<
                               typename tune_option::template update_t<
                                       elem_t_t<tune_key::data_type_a, dtype_a>,
                                       elem_v_t<tune_key::memory_layout_a,
@@ -54,16 +54,16 @@ struct default_gemm_config_t
                                       elem_t_t<tune_key::data_type_acc,
                                               dtype_acc>,
                                       elem_v_t<tune_key::gpu_arch,
-                                              gpu_arch_tag>>>>::type> {};
+                                              arch_tag>>>>::type> {};
 
 template <typename dtype_a, mem_layout mem_layout_a, uint32_t alignment_a,
         typename dtype_b, mem_layout mem_layout_b, uint32_t alignment_b,
         typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
-        typename dtype_acc, gpu_arch gpu_arch_tag = gpu_arch::Xe,
+        typename dtype_acc, gpu_arch arch_tag = gpu_arch::Xe,
         typename tune_option = dict_t<>>
 using default_gemm_t = typename default_gemm_config_t<dtype_a, mem_layout_a,
         alignment_a, dtype_b, mem_layout_b, alignment_b, dtype_c, mem_layout_c,
-        alignment_c, dtype_acc, gpu_arch_tag, tune_option>::type;
+        alignment_c, dtype_acc, arch_tag, tune_option>::type;
 } // namespace kernel
 
 template <typename dict_t_>
@@ -73,15 +73,20 @@ struct param_optimizer<param_optimizer_tag::kernel, dict_t_> {
                                param_optimizer_type> != dict_t_::impl::key_not_found)
             && (dict_t_::template find_elem_v<tune_key::
                                 param_optimizer_type> == tune_key_value::param_optimizer_decision_tree);
+    static constexpr auto arch_tag
+            = (dict_t_::impl::template find_elem_index<
+                       tune_key::gpu_arch> != dict_t_::impl::key_not_found)
+            ? dict_t_::template find_elem_v<tune_key::gpu_arch>
+            : gpu_arch::Xe;
     static constexpr auto optimizer_mode
             = dict_t_::template find_elem_v<tune_key::param_optimizer_mode>;
     using type = typename std::conditional<use_rule,
             decision_tree_optimizer<param_optimizer_tag::kernel, dict_t_,
                     optimizer_mode>,
             dummy_optimizer<param_optimizer_tag::kernel, dict_t_,
-                    kernel::param_kslicing_g1l1_t,
-                    kernel::param_kslicing_g2l1_t,
-                    kernel::param_kslicing_g1l2_t>>::type::type;
+                    kernel::param_kslicing_g1l1_t<arch_tag>,
+                    kernel::param_kslicing_g2l1_t<arch_tag>,
+                    kernel::param_kslicing_g1l2_t<arch_tag>>>::type::type;
 };
 
 template <typename dict_t_>
@@ -126,12 +131,12 @@ namespace group {
 template <typename dtype_a, mem_layout mem_layout_a, uint32_t alignment_a,
         mem_space mem_space_a, typename dtype_b, mem_layout mem_layout_b,
         uint32_t alignment_b, mem_space mem_space_b, typename dtype_acc,
-        typename wg_shape, uint32_t wg_tile_k,
-        gpu_arch gpu_arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
+        typename wg_shape, uint32_t wg_tile_k, gpu_arch arch_tag = gpu_arch::Xe,
+        typename tune_option = dict_t<>>
 struct default_gemm_selector_config_t
     : param_adaptor<param_adaptor_tag::work_group_gemm,
               typename param_optimizer<param_optimizer_tag::work_group,
-                      typename default_param_t::template update_dict_t<
+                      typename default_param_t<arch_tag>::template update_dict_t<
                               typename tune_option::template update_t<
                                       elem_t_t<tune_key::data_type_a, dtype_a>,
                                       elem_v_t<tune_key::memory_layout_a,
@@ -153,25 +158,25 @@ struct default_gemm_selector_config_t
                                               wg_shape>,
                                       elem_v_t<tune_key::wg_tile_k, wg_tile_k>,
                                       elem_v_t<tune_key::gpu_arch,
-                                              gpu_arch_tag>>>>::type> {};
+                                              arch_tag>>>>::type> {};
 
 template <typename dtype_a, mem_layout mem_layout_a, uint32_t alignment_a,
         mem_space mem_space_a, typename dtype_b, mem_layout mem_layout_b,
         uint32_t alignment_b, mem_space mem_space_b, typename dtype_acc,
-        typename wg_shape, uint32_t wg_tile_k,
-        gpu_arch gpu_arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
+        typename wg_shape, uint32_t wg_tile_k, gpu_arch arch_tag = gpu_arch::Xe,
+        typename tune_option = dict_t<>>
 using default_gemm_selector_t = typename default_gemm_selector_config_t<dtype_a,
         mem_layout_a, alignment_a, mem_space_a, dtype_b, mem_layout_b,
-        alignment_b, mem_space_b, dtype_acc, wg_shape, wg_tile_k, gpu_arch_tag,
+        alignment_b, mem_space_b, dtype_acc, wg_shape, wg_tile_k, arch_tag,
         tune_option>::type;
 
 template <typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
         mem_space mem_space_c, typename wg_shape, uint32_t wg_tile_k,
-        gpu_arch gpu_arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
+        gpu_arch arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
 struct default_epilogue_selector_config_t
     : param_adaptor<param_adaptor_tag::work_group_epilogue,
               typename param_optimizer<param_optimizer_tag::work_group,
-                      typename default_param_t::template update_dict_t<
+                      typename default_param_t<arch_tag>::template update_dict_t<
                               typename tune_option::template update_t<
                                       elem_t_t<tune_key::data_type_c, dtype_c>,
                                       elem_v_t<tune_key::memory_layout_c,
@@ -184,14 +189,14 @@ struct default_epilogue_selector_config_t
                                               wg_shape>,
                                       elem_v_t<tune_key::wg_tile_k, wg_tile_k>,
                                       elem_v_t<tune_key::gpu_arch,
-                                              gpu_arch_tag>>>>::type> {};
+                                              arch_tag>>>>::type> {};
 
 template <typename dtype_c, mem_layout mem_layout_c, uint32_t alignment_c,
         mem_space mem_space_c, typename wg_shape, uint32_t wg_tile_k,
-        gpu_arch gpu_arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
+        gpu_arch arch_tag = gpu_arch::Xe, typename tune_option = dict_t<>>
 using default_epilogue_selector_t =
         typename default_epilogue_selector_config_t<dtype_c, mem_layout_c,
-                alignment_c, mem_space_c, wg_shape, wg_tile_k, gpu_arch_tag,
+                alignment_c, mem_space_c, wg_shape, wg_tile_k, arch_tag,
                 tune_option>::type;
 } // namespace group
 
@@ -204,11 +209,16 @@ struct param_optimizer<param_optimizer_tag::work_group, dict_t_> {
                                 param_optimizer_type> == tune_key_value::param_optimizer_decision_tree);
     static constexpr auto optimizer_mode
             = dict_t_::template find_elem_v<tune_key::param_optimizer_mode>;
+    static constexpr auto arch_tag
+            = (dict_t_::impl::template find_elem_index<
+                       tune_key::gpu_arch> != dict_t_::impl::key_not_found)
+            ? dict_t_::template find_elem_v<tune_key::gpu_arch>
+            : gpu_arch::Xe;
     using type = typename std::conditional<use_rule,
             decision_tree_optimizer<param_optimizer_tag::work_group, dict_t_,
                     optimizer_mode>,
             dummy_optimizer<param_optimizer_tag::work_group, dict_t_,
-                    group::param_dict1_wg_t>>::type::type;
+                    group::param_dict1_wg_t<arch_tag>>>::type::type;
 };
 
 template <typename dict_t_>
diff --git a/include/kernel/gemm/gemm_preset.hpp b/include/kernel/gemm/gemm_preset.hpp
index d62d8c2b0..5e8bfc1dd 100644
--- a/include/kernel/gemm/gemm_preset.hpp
+++ b/include/kernel/gemm/gemm_preset.hpp
@@ -45,26 +45,27 @@ using param_performance_default
                 elem_v_t<tune_key::prefetch_distance, 3UL, uint32_t>,
                 elem_v_t<tune_key::periodic_sync_interval, 8UL, uint32_t>>;
 
+template <gpu_arch arch_tag = gpu_arch::Xe>
 using param_runtime_default
         = dict_t<elem_v_t<tune_key::pre_processing,
                          tune_key_value::pre_processing_default>,
                 elem_v_t<tune_key::mma_engine, mma_engine::xmx>,
-                elem_v_t<tune_key::gpu_arch, gpu_arch::Xe>,
+                elem_v_t<tune_key::gpu_arch, arch_tag>,
                 elem_t_t<tune_key::epilogue_policy,
-                        group::epilogue_policy_default<gpu_arch::Xe>>,
+                        group::epilogue_policy_default<arch_tag>>,
                 elem_v_t<tune_key::dispatch_policy,
                         tune_key_value::dispatch_policy_default>,
                 elem_t_t<tune_key::group_swizzle_policy,
-                        kernel::group_swizzle_default<gpu_arch::Xe>>>;
+                        kernel::group_swizzle_default<arch_tag>>>;
 } // namespace detail
-
+template <gpu_arch arch_tag = gpu_arch::Xe>
 using default_param_t = dict_t<>::template update_dict_t<
         detail::param_dtype_bf16_bf16_bf16>::template update_dict_t<detail::
                 param_memlayout_rrr>::template update_dict_t<detail::
                 param_memalignment_8_8_8>::template update_dict_t<detail::
                 param_memspace_ggg>::template update_dict_t<detail::
                 param_performance_default>::template update_dict_t<detail::
-                param_runtime_default>::
+                param_runtime_default<arch_tag>>::
         template update_t<elem_t_t<tune_key::data_type_acc, float>,
                 elem_v_t<tune_key::global_kslicing_ratio, 1UL, uint32_t>,
                 elem_v_t<tune_key::local_kslicing_ratio, 1UL, uint32_t>,
@@ -76,7 +77,8 @@ using default_param_t = dict_t<>::template update_dict_t<
                         param_optimizer_mode::full, param_optimizer_mode>>;
 
 namespace kernel {
-using param_kslicing_g1l1_t = default_param_t::template update_t<
+template <gpu_arch arch_tag = gpu_arch::Xe>
+using param_kslicing_g1l1_t = default_param_t<arch_tag>::template update_t<
         elem_v_t<tune_key::global_kslicing_ratio, 1UL, uint32_t>,
         elem_v_t<tune_key::local_kslicing_ratio, 1UL, uint32_t>,
         elem_t_t<tune_key::wg_tile_shape, shape<256, 256>>,
@@ -85,7 +87,8 @@ using param_kslicing_g1l1_t = default_param_t::template update_t<
         elem_v_t<tune_key::dispatch_policy,
                 tune_key_value::dispatch_policy_kslicing>>;
 
-using param_kslicing_g2l1_t = default_param_t::template update_t<
+template <gpu_arch arch_tag = gpu_arch::Xe>
+using param_kslicing_g2l1_t = default_param_t<arch_tag>::template update_t<
         elem_v_t<tune_key::global_kslicing_ratio, 2UL, uint32_t>,
         elem_v_t<tune_key::local_kslicing_ratio, 1UL, uint32_t>,
         elem_t_t<tune_key::wg_tile_shape, shape<256, 256>>,
@@ -94,7 +97,8 @@ using param_kslicing_g2l1_t = default_param_t::template update_t<
         elem_v_t<tune_key::dispatch_policy,
                 tune_key_value::dispatch_policy_kslicing>>;
 
-using param_kslicing_g1l2_t = default_param_t::template update_t<
+template <gpu_arch arch_tag = gpu_arch::Xe>
+using param_kslicing_g1l2_t = default_param_t<arch_tag>::template update_t<
         elem_v_t<tune_key::global_kslicing_ratio, 1UL, uint32_t>,
         elem_v_t<tune_key::local_kslicing_ratio, 2UL, uint32_t>,
         elem_t_t<tune_key::wg_tile_shape, shape<128, 64>>,
@@ -106,7 +110,8 @@ using param_kslicing_g1l2_t = default_param_t::template update_t<
 } // namespace kernel
 
 namespace group {
-using param_dict1_wg_t = default_param_t::template update_t<
+template <gpu_arch arch_tag = gpu_arch::Xe>
+using param_dict1_wg_t = default_param_t<arch_tag>::template update_t<
         elem_t_t<tune_key::data_type_acc, float>,
         elem_t_t<tune_key::wg_tile_shape, shape<256, 256>>,
         elem_v_t<tune_key::wg_tile_k, 32UL, uint32_t>,
@@ -114,6 +119,6 @@ using param_dict1_wg_t = default_param_t::template update_t<
         elem_v_t<tune_key::prefetch_distance, 3UL, uint32_t>,
         elem_v_t<tune_key::periodic_sync_interval, 8UL, uint32_t>,
         elem_t_t<tune_key::epilogue_policy,
-                group::epilogue_policy_default<gpu_arch::Xe>>>;
+                group::epilogue_policy_default<arch_tag>>>;
 }
 } // namespace gpu::xetla
diff --git a/include/kernel/gemm/impl/default_xe.hpp b/include/kernel/gemm/impl/default_xe.hpp
index 93c949e09..78c63ffbd 100644
--- a/include/kernel/gemm/impl/default_xe.hpp
+++ b/include/kernel/gemm/impl/default_xe.hpp
@@ -176,8 +176,8 @@ class gemm_universal_t<dispatch_policy_default<group_swizzle_>, gemm_t_,
     /// @return The size of local memory required.
     __XETLA_API static constexpr uint32_t get_slm_size() {
         constexpr uint32_t size = gemm_t::slm_size + epilogue_t::slm_size;
-        static_assert(size <= (128 * 1024),
-                "The local memory size should be less than 128KB!");
+        static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
+                "The local memory size excess!");
         return size;
     };
 
diff --git a/include/kernel/gemm/impl/kslicing_xe.hpp b/include/kernel/gemm/impl/kslicing_xe.hpp
index 87dd795f8..6415c39c1 100644
--- a/include/kernel/gemm/impl/kslicing_xe.hpp
+++ b/include/kernel/gemm/impl/kslicing_xe.hpp
@@ -239,8 +239,8 @@ class gemm_universal_t<dispatch_policy_kslicing<group_swizzle_,
     __XETLA_API static constexpr uint32_t get_slm_size() {
         constexpr uint32_t size = gemm_slm_size * num_local_kslicing
                 + kslicing_slm_size + epilogue_slm_size * num_local_kslicing;
-        static_assert(size <= (128 * 1024),
-                "The local memory size should be less than 128KB!");
+        static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
+                "The local memory size excess!");
         return size;
     }
 
diff --git a/include/kernel/gemm/impl/stream_k_xe.hpp b/include/kernel/gemm/impl/stream_k_xe.hpp
index 620242062..eec3948d9 100644
--- a/include/kernel/gemm/impl/stream_k_xe.hpp
+++ b/include/kernel/gemm/impl/stream_k_xe.hpp
@@ -217,8 +217,8 @@ class gemm_universal_t<dispatch_policy_stream_k<gpu_arch::Xe>, gemm_t_,
     /// @return The size of local memory required.
     __XETLA_API static constexpr uint32_t get_slm_size() {
         constexpr uint32_t size = gemm_t::slm_size + epilogue_t::slm_size;
-        static_assert(size <= (128 * 1024),
-                "The local memory size should be less than 128KB!");
+        static_assert(size <= arch_attr_t<arch_tag>::local_mem_size,
+                "The local memory size excess!");
         return size;
     };
 
diff --git a/media/docs/construct_a_gemm.md b/media/docs/construct_a_gemm.md
index a5a3cbf17..ef54b287a 100644
--- a/media/docs/construct_a_gemm.md
+++ b/media/docs/construct_a_gemm.md
@@ -6,16 +6,16 @@ As shown in the diagram below, each workgroup will calculate a sub-matrix, repre
 
 ![ALT](/media/docs/dom.jpg "GEMM decomposition by workgroup and subgroup")
 
-## Basic Components  
+## Basic Components
 
 1. Select a `GEMM building block`, considering the division of work-group and sub-group
-2. Decide if `splitK` or `steamK` is needed in specific shape 
+2. Decide if `splitK` or `streamK` is needed in specific shape
 3. Define `epilogue` that specifies what you want to fuse after the GEMM computation based on accumulator
 4. Instantiate a `gemm` implementation by the selections from 1)-3).
 
 For a runnable code example, you can refer to the code in the [02_basic_gemm](/examples/02_basic_gemm).
 
-### Task Mapping 
+### Task Mapping
 Before launching the GPU kernel, it is crucial to determine how to map the entire GEMM computation onto the GPU, considering work-group and sub-group configurations. Efficiently utilizing GPU resources requires careful consideration of factors such as the operation's shape, data type, and the hardware specifications of the GPU. A typical configuration for workgroups and subgroups may resemble the example below, especially when the input shape is sufficient to fully utilize the GPU.
 
 ```c++
@@ -64,7 +64,7 @@ Alternatively, the subgroup-level splitK is also available i which can accumulat
 
 ![ALT](/media/docs/subgroup_splitK.jpg "split K in subgroup level")
 
-For kernel level API, we can set two parameters in dispatch policy of `gemm_universal` API. Definitely, you can set both value to large than 1 for mixing workgroup and subgroup level split K together. 
+For the kernel-level API, we can set two parameters in the dispatch policy of the `gemm_universal` API. You can also set both values larger than 1 to mix workgroup-level and subgroup-level split K together.
 
 ```c++
  using dispatch_policy
@@ -87,7 +87,7 @@ decide the location of input and output matrix which is either from global or sh
             mem_space::global, // memory reading from global mem for B
             8, // buffer alignment for A, in unit of element
             8, // buffer alignment for B, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             tile_shape, // computation tile shape
             sg_tile_k, // elements in each iteration
             mma_engine::xmx, // compute engine
@@ -122,7 +122,7 @@ class epilogue_t {};
 - `tile_shape` is the problem size of each group and subgroup.
 - `mem_desc_c` is the description of buffer `c`, which includes `memory data type`, `memory space` and `memory layout`...
 
-In example [03_gemm_relu_bias](/examples/03_gemm_relu_bias), a chain of operations is effectively fused into the GEMM computation. 
+In example [03_gemm_relu_bias](/examples/03_gemm_relu_bias), a chain of operations is effectively fused into the GEMM computation.
 First, using pre-defined post-operations `relu` and `bias_add`, and then pass it to `epilogue_policy::tile_op_t`.
 
 ```c++
@@ -132,7 +132,7 @@ using tile_op_t = chained_tile_op_t<
                   >;
 ```
 
-### GEMM Instantiate 
+### GEMM Instantiate
 
 After configuration of BRGEMM and epilogue, it's simple to build entire GEMM with:
 - assigning tasks to each group, setting working boundaries and starting position accordingly.
@@ -153,7 +153,7 @@ Finally, the actual data will be passed using gemm_op_t::arguments_t, and all of
 typename gemm_op_t::arguments_t arg(matrix_n, matrix_k,
                      matrix_m, A, matrix_k, B, matrix_n, C, matrix_n);
 ```
-```c++ 
+```c++
 gemm_op_t gemm_op;
 
 gemm_op(item, arg);
diff --git a/tests/integration/default_config/group_gemm/kernel_func.hpp b/tests/integration/default_config/group_gemm/kernel_func.hpp
index 13c91ba1f..aecaf6fd9 100644
--- a/tests/integration/default_config/group_gemm/kernel_func.hpp
+++ b/tests/integration/default_config/group_gemm/kernel_func.hpp
@@ -34,11 +34,11 @@ struct default_config_group_gemm_test_func {
     // should larger than 8
     static constexpr uint32_t k_stride = sg_k;
 
-    // Step 1: define mirco-kernel's configuration
+    // Step 1: define micro-kernel's configuration
     using wg_shape = shape<wg_n, wg_m>;
     using sg_shape = shape<sg_n, sg_m>;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using gemm_tune_option = dict_t<
             elem_v_t<tune_key::param_optimizer_type,
                     tune_key_value::param_optimizer_decision_tree>,
@@ -59,7 +59,7 @@ struct default_config_group_gemm_test_func {
             layout_b, // memory layout for B
             8, // leading dimension alignment for B, in unit of element
             mem_space::global, // memory reading from global mem for B
-            dtype_acc, // accumulator data type for intermediate resutls
+            dtype_acc, // accumulator data type for intermediate results
             wg_shape, // computation tile shape
             k_stride, // elements in each iteration
             gpu_arch::Xe, // GPU arch
diff --git a/tests/integration/default_config/kernel_gemm/kernel_func.hpp b/tests/integration/default_config/kernel_gemm/kernel_func.hpp
index 84e9ab1b3..f16a50c97 100644
--- a/tests/integration/default_config/kernel_gemm/kernel_func.hpp
+++ b/tests/integration/default_config/kernel_gemm/kernel_func.hpp
@@ -48,7 +48,7 @@ struct default_config_kernel_gemm_test_func {
             dtype_c, // output datatype for C
             mem_layout::row_major, // memory layout for C
             8, // leading dimension alignment for C, in unit of element
-            dtype_acc, // accumulator data type for intermediate resutls
+            dtype_acc, // accumulator data type for intermediate results
             gpu_arch::Xe, // GPU arch
             tune_option>;
 
diff --git a/tests/integration/gemm/bf16_stream_k/main.cpp b/tests/integration/gemm/bf16_stream_k/main.cpp
index df3dfc54b..55b3570b9 100644
--- a/tests/integration/gemm/bf16_stream_k/main.cpp
+++ b/tests/integration/gemm/bf16_stream_k/main.cpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include <utils/utils.hpp>
 #include "xetla.hpp"
+#include <utils/utils.hpp>
 
 using namespace gpu::xetla;
 //The number of times the kernel is executed
@@ -245,7 +245,7 @@ void stream_k_gemm_run(uint32_t iter) {
     static constexpr uint32_t periodic_sync_interval = 4;
     static constexpr uint32_t prefetch_distance = 4;
 
-    // Mirco-kernel configuration
+    // Micro-kernel configuration
     using gemm_config = typename xetla::group::gemm_selector_t<
             data_type_a, // input datatype for A
             data_type_b, // input datatype for B
@@ -255,7 +255,7 @@ void stream_k_gemm_run(uint32_t iter) {
             mem_space::global, // memory reading from global mem for B
             8, // leading dimension for A, in unit of element
             8, // leading dimension for B, in unit of element
-            data_type_acc, // accumulator data type for intermediate resutls
+            data_type_acc, // accumulator data type for intermediate results
             tile_shape, // computation tile shape
             sg_tile_k, // elements in each iteration
             mma_engine::xmx, // compute engine
@@ -299,9 +299,11 @@ void stream_k_gemm_run(uint32_t iter) {
             gemm_config::k_stride, wg_tile_n, sg_tile_m, sg_tile_n,
             avail_xecores);
 
-
-    static const std::string env_set_str = "SYCL_PROGRAM_COMPILE_OPTIONS= -vc-codegen -doubleGRF -vc-disable-indvars-opt -Xfinalizer ' -printregusage -enableBCR -DPASTokenReduction '";
-    putenv(const_cast<char*>(env_set_str.c_str()));
+    static const std::string env_set_str
+            = "SYCL_PROGRAM_COMPILE_OPTIONS= -vc-codegen -doubleGRF "
+              "-vc-disable-indvars-opt -Xfinalizer ' -printregusage -enableBCR "
+              "-DPASTokenReduction '";
+    putenv(const_cast<char *>(env_set_str.c_str()));
     //Define and initialize the data required for the calculation
     auto A = alloc_device_and_init<data_type_a>(
             size_a,
@@ -434,7 +436,7 @@ void stream_k_gemm_run(uint32_t iter) {
     }
 
     static const std::string env_unset_str = "SYCL_PROGRAM_COMPILE_OPTIONS=";
-    putenv(const_cast<char*>(env_unset_str.c_str()));
+    putenv(const_cast<char *>(env_unset_str.c_str()));
 
     ASSERT_EQ(0,
             gemm_result_validate(A, B, C, Bias, matrix_m, matrix_k, matrix_n,
diff --git a/tests/utils/execution.hpp b/tests/utils/execution.hpp
index 66519472e..06f7886e3 100644
--- a/tests/utils/execution.hpp
+++ b/tests/utils/execution.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <iomanip>
+#include <stdexcept>
 #include "common.hpp"
 #include "profiling.hpp"
 #include "xetla.hpp"
@@ -243,10 +244,8 @@ class dispatch_arch {
         fmt_bak.copyfmt(std::cout);
 
         sycl::device device;
-        if (!device.has(aspect::ext_intel_device_id)) {
-            std::cout << "Can not get device ID\n";
-            return;
-        }
+        if (!device.has(aspect::ext_intel_device_id))
+            throw std::runtime_error("Can not get device ID");
         auto deviceID = device.get_info<ext::intel::info::device::device_id>();
         std::cout << "deviceID: 0x" << std::hex //
                   << std::right << std::setfill('0') << deviceID << "\n";
@@ -266,14 +265,13 @@ class dispatch_arch {
             case ENS::architecture::intel_gpu_dg2_g11:
             case ENS::architecture::intel_gpu_dg2_g12:
                 return F<gpu_arch::Dg2>::exec(std::forward<Args>(args)...);
-                return;
             default: break;
         }
 
 #endif
-        std::cout << "No maching architecture, checking device ID ...\n";
+        std::cout << "No matching architecture, checking device ID ...\n";
         switch (deviceID) {
-            // DG2 devices: https://gfxspecs.intel.com/Predator/Home/Index/44477
+            // DG2 devices
             case 0x56a0: // Intel® Arc ™ A770 Graphics
             case 0x56a1: // Intel® Arc ™ A750 Graphics
             case 0x56a2: // Intel® Arc ™ A580 Graphics
@@ -281,10 +279,22 @@ class dispatch_arch {
             case 0x5691: // Intel® Arc ™ A730M Graphics
             case 0x5692: // Intel® Arc ™ A550M Graphics
                 return F<gpu_arch::Dg2>::exec(std::forward<Args>(args)...);
-            // PVC devices: https://gfxspecs.intel.com/Predator/Home/Index/44484
+            // PVC devices
             case 0x0bda: //
                 return F<gpu_arch::Xe>::exec(std::forward<Args>(args)...);
-            default: std::cout << "Unknown device ID \n"; return;
+            default: std::cout << "Unknown device ID \n"; break;
+        }
+
+        if (!device.has(aspect::ext_intel_gpu_eu_simd_width))
+            throw std::runtime_error("Can not get eu_simd_width");
+        auto eu_simd_width = device.get_info<
+                ext::intel::info::device::gpu_eu_simd_width>();
+        if (eu_simd_width == 8) { // SIMD8: dispatch as Dg2
+            return F<gpu_arch::Dg2>::exec(std::forward<Args>(args)...);
+        } else if (eu_simd_width == 16) { // SIMD16: dispatch as Xe
+            return F<gpu_arch::Xe>::exec(std::forward<Args>(args)...);
+        } else { // neither known device ID nor a handled SIMD width
+            throw std::runtime_error("Unsupported eu_simd_width");
         }
     }
 };

From 5c83bb19591b1f9f6af967e4f45c58560d6b5984 Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Fri, 8 Mar 2024 01:14:57 +0000
Subject: [PATCH 07/11] add some doc

---
 CMakeLists.txt                           |  2 +-
 include/kernel/default_config/common.hpp |  6 +++++-
 tests/utils/execution.hpp                | 14 +++++++++++---
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a6979374..d305e5a65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,7 +46,7 @@ endif ()
 add_compile_options(-fsycl)
 add_link_options(-fsycl)
 if(UNIX)
-    add_compile_options(-fp-model=precise -Wall -Wextra )
+    add_compile_options(-fp-model=precise -Wall -Wextra -Werror)
     add_link_options(-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -lm)
     link_libraries(-lgtest -lgtest_main)
 else() # Windows
diff --git a/include/kernel/default_config/common.hpp b/include/kernel/default_config/common.hpp
index 046eb3237..31cacfde5 100644
--- a/include/kernel/default_config/common.hpp
+++ b/include/kernel/default_config/common.hpp
@@ -98,7 +98,11 @@ enum class tune_key_value : uint8_t {
 // parameter optimizer
 
 enum class param_optimizer_tag : uint8_t { kernel, work_group };
-enum class param_optimizer_mode : uint8_t { full, keep_shape };
+// optimizer_mode (currently only useful with param_optimizer_decision_tree)
+enum class param_optimizer_mode : uint8_t {
+    full, // optimize all available options
+    keep_shape, // optimize all execept keepping the original wg/sg tile shape
+};
 
 template <param_optimizer_tag tag_, typename dict_t_>
 struct param_optimizer;
diff --git a/tests/utils/execution.hpp b/tests/utils/execution.hpp
index 06f7886e3..dd30d2756 100644
--- a/tests/utils/execution.hpp
+++ b/tests/utils/execution.hpp
@@ -168,12 +168,15 @@ void gemm_exec(const std::string &compile_str, size_t batch = 1) {
     }
 }
 
-/// @brief The template function to execute kernel in esimd way for unit test framework
+/// @brief The template function to execute kernel in esimd way for unit test
+/// framework
 ///
-/// @tparam data_type data_type The data type of buffer used in kernel and buffer allocation
+/// @tparam data_type The data type of buffer used in kernel and
+/// buffer allocation
 /// @tparam KERNEL the kernel function struct
 /// @param nd_range the range of workitems
-/// @param validate_result validation function, taking 3 parameters buffer A, B as input C as output
+/// @param validate_result validation function, taking 3 parameters: buffers
+/// A and B as input, and C as output
 ///
 template <typename data_type, class KERNEL, size_t SLMSIZE = 8 * 1024,
         size_t BARNUM = 32, size_t Size = 4096>
@@ -232,6 +235,11 @@ void kernel_run(auto nd_range, auto validate_result) {
     free(C_host);
 }
 
+/// @brief Runs F<arch>::exec using the gpu_arch of the current machine
+///
+/// @tparam F The gpu_arch-templated function wrapper
+///
+/// @note See /examples/01 or /examples/02 for example usage
 template <template <gpu_arch> class F>
 class dispatch_arch {
     using T_RET = std::invoke_result_t<decltype(F<gpu_arch::Xe>::exec)>;

From 8f5cd91660c5c35b26d59dbe1a2588b019a88bcc Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 12 Mar 2024 08:17:01 +0000
Subject: [PATCH 08/11] global mem desc alignment & use arch_config

---
 .../scaled_dot_product_attention.cpp          | 28 ++++++++++++-------
 .../group/gemm/compute_policy.hpp             |  2 +-
 include/group/gemm/compute_policy.hpp         |  5 ++--
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
index de752a855..59ecea9f8 100644
--- a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
+++ b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
@@ -152,11 +152,15 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
     constexpr uint32_t matrix_n_qk = sequence_len;
     constexpr uint32_t matrix_k_qk = head_size;
 
-    constexpr uint32_t wg_tile_m_qksv = arch_tag == gpu_arch::Xe ? 64 : 32;
+    constexpr double slm_ratio_to_pvc
+            = static_cast<double>(arch_attr_t<arch_tag>::local_mem_size)
+            / arch_attr_t<gpu_arch::Xe>::local_mem_size;
+
+    constexpr uint32_t wg_tile_m_qksv = 64 * slm_ratio_to_pvc;
 
     constexpr uint32_t wg_tile_m_qk = wg_tile_m_qksv;
     constexpr uint32_t wg_tile_n_qk = 512; // must == sl_kv
-    constexpr uint32_t sg_tile_m_qk = arch_tag == gpu_arch::Xe ? 32 : 16;
+    constexpr uint32_t sg_tile_m_qk = 32 * slm_ratio_to_pvc;
     constexpr uint32_t sg_tile_n_qk = 32;
     constexpr uint32_t wg_tile_k_qk = 32;
 
@@ -169,7 +173,7 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
     constexpr uint32_t wg_tile_m_sv = wg_tile_m_qksv;
     constexpr uint32_t wg_tile_n_sv = 64; // must == head_dim
     constexpr uint32_t sg_tile_m_sv = 8;
-    constexpr uint32_t sg_tile_n_sv = arch_tag == gpu_arch::Xe ? 16 : 8;
+    constexpr uint32_t sg_tile_n_sv = 16 * slm_ratio_to_pvc;
     constexpr uint32_t wg_tile_k_sv = 32;
 
     // buffer size of softmax row data
@@ -290,12 +294,14 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
                     using gemm0_t = xetla::group::default_gemm_selector_t<
                             dtype_in, // input datatype for A
                             mem_layout::row_major, // memory layout for A
-                            8, // leading dimension for A, in unit of element
+                            // alignment for A, in unit of element
+                            DEVICE_MEM_ALIGNMENT / sizeof(dtype_in),
                             mem_space::
                                     global, // memory reading from global mem for A
                             dtype_in, // input datatype for B
                             mem_layout::row_major, // memory layout for B
-                            8, // leading dimension for B, in unit of element
+                            // alignment for B, in unit of element
+                            DEVICE_MEM_ALIGNMENT / sizeof(dtype_in),
                             mem_space::
                                     global, // memory reading from global mem for B
                             float, // accumulator data type for intermediate results
@@ -306,7 +312,7 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
                     using epilogue0_t = xetla::group::default_epilogue_selector_t<
                             dtype_sfx, // onput datatype for C
                             mem_layout::row_major, // memory layout for C
-                            8, // leading dimension for C, in unit of element
+                            8, // alignment for C, in unit of element
                             mem_space::
                                     local, // memory writing to local mem for C
                             wg_shape0, // computation tile shape
@@ -382,12 +388,13 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
                     using gemm1_t = xetla::group::default_gemm_selector_t<
                             dtype_in, // input datatype for A
                             mem_layout::row_major, // memory layout for A
-                            8, // leading dimension for A, in unit of element
+                            8, // alignment for A, in unit of element
                             mem_space::
                                     local, // memory reading from local mem for A
                             dtype_in, // input datatype for B
                             mem_layout::row_major, // memory layout for B
-                            8, // leading dimension for B, in unit of element
+                            // alignment for B, in unit of element
+                            DEVICE_MEM_ALIGNMENT / sizeof(dtype_in),
                             mem_space::
                                     global, // memory reading from global mem for B
                             float, // accumulator data type for intermediate results
@@ -402,8 +409,9 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
                     using work_group_t = typename gemm1_t::work_group_t;
                     using mem_desc_a_t = typename gemm1_t::mem_desc_a_t;
                     using mem_desc_b_t = typename gemm1_t::mem_desc_b_t;
-                    using mem_desc_c_t = mem_desc_t<dtype_out,
-                            mem_layout::row_major, mem_space::global>;
+                    using mem_desc_c_t = mem_desc_t< //
+                            dtype_out, mem_layout::row_major, mem_space::global,
+                            DEVICE_MEM_ALIGNMENT / sizeof(dtype_out)>;
                     // Using gemm::matAcc init a matC class for future storage
                     using matAcc_t = typename gemm1_t::matAcc_t;
                     using matC_t = tile_t<dtype_out,
diff --git a/include/experimental/group/gemm/compute_policy.hpp b/include/experimental/group/gemm/compute_policy.hpp
index 460ec110c..11d2c0e39 100644
--- a/include/experimental/group/gemm/compute_policy.hpp
+++ b/include/experimental/group/gemm/compute_policy.hpp
@@ -58,7 +58,7 @@ struct compute_policy_int4_dequantize_xmx<compute_attr_, perf_tuning_knob_,
     static constexpr bool is_int4_matB_policy = true;
 
     static constexpr uint32_t block_size_x_b
-            = arch_tag == gpu_arch::Dg2 ? 8 : 16;
+            = arch_attr_t<arch_tag>::mma_attr::mma_n_in_elem;
     static constexpr uint32_t block_bytes_y_b = 32;
     static_assert(block_bytes_x_a == block_bytes_y_b,
             "mat_a x need to match with mat_b y");
diff --git a/include/group/gemm/compute_policy.hpp b/include/group/gemm/compute_policy.hpp
index aa4343a28..338fc46d5 100644
--- a/include/group/gemm/compute_policy.hpp
+++ b/include/group/gemm/compute_policy.hpp
@@ -54,7 +54,8 @@ struct compute_policy_default_xmx<compute_attr_, perf_tuning_knob_, arch_tag_,
             = block_bytes_x_a / sizeof(dtype_mma_a);
     static constexpr uint32_t block_size_y_a = 16;
 
-    static constexpr uint32_t block_size_x_b = arch_tag < gpu_arch::Xe ? 8 : 16;
+    static constexpr uint32_t block_size_x_b
+            = arch_attr_t<arch_tag>::mma_attr::mma_n_in_elem;
     static constexpr uint32_t block_bytes_y_b = 32;
     static constexpr uint32_t block_size_y_b
             = block_bytes_y_b / sizeof(dtype_mma_b);
@@ -91,7 +92,7 @@ struct compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_, arch_tag_,
     static constexpr uint32_t block_size_y_a = 16;
 
     static constexpr uint32_t block_size_x_b
-            = arch_tag == gpu_arch::Dg2 ? 8 : 16;
+            = arch_attr_t<arch_tag>::mma_attr::mma_n_in_elem;
     static constexpr uint32_t block_bytes_y_b = 32;
     static constexpr uint32_t block_size_y_b
             = block_bytes_y_b / sizeof(dtype_mma_b);

From 540373ab42aba999305b49fc74d1a7b79383af6c Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 12 Mar 2024 08:50:00 +0000
Subject: [PATCH 09/11] param_optimizer_mode => param_optimizer_level

---
 .../scaled_dot_product_attention.cpp                          | 4 ++--
 include/kernel/default_config/common.hpp                      | 4 ++--
 include/kernel/default_config/decision_tree_policy.hpp        | 4 ++--
 include/kernel/gemm/default_gemm.hpp                          | 4 ++--
 include/kernel/gemm/gemm_preset.hpp                           | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
index 59ecea9f8..2a4ac852f 100644
--- a/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
+++ b/examples/08_scaled_dot_product_attention/scaled_dot_product_attention.cpp
@@ -277,8 +277,8 @@ void sdp_fwd_run(uint32_t iter, uint32_t warmup = 10) {
                     using group_swizzle = group_swizzle_default<arch_tag>;
 
                     using elem_opt_mode_t
-                            = elem_v_t<tune_key::param_optimizer_mode,
-                                    param_optimizer_mode::keep_shape>;
+                            = elem_v_t<tune_key::param_optimizer_level,
+                                    param_optimizer_level::keep_shape>;
                     using elem_opt_type_t = elem_v_t<
                             tune_key::param_optimizer_type,
                             tune_key_value::param_optimizer_decision_tree>;
diff --git a/include/kernel/default_config/common.hpp b/include/kernel/default_config/common.hpp
index 31cacfde5..283388ed0 100644
--- a/include/kernel/default_config/common.hpp
+++ b/include/kernel/default_config/common.hpp
@@ -52,7 +52,7 @@ enum class tune_key : uint8_t {
     dispatch_policy,
     group_swizzle_policy,
     param_optimizer_type,
-    param_optimizer_mode,
+    param_optimizer_level,
     source_location
 };
 template <typename T>
@@ -99,7 +99,7 @@ enum class tune_key_value : uint8_t {
 
 enum class param_optimizer_tag : uint8_t { kernel, work_group };
 // optimizer_mode (currently only useful with param_optimizer_decision_tree)
-enum class param_optimizer_mode : uint8_t {
+enum class param_optimizer_level : uint8_t {
     full, // optimize all available options
     keep_shape, // optimize all execept keepping the original wg/sg tile shape
 };
diff --git a/include/kernel/default_config/decision_tree_policy.hpp b/include/kernel/default_config/decision_tree_policy.hpp
index f9d89fbd5..e84ca2bfe 100644
--- a/include/kernel/default_config/decision_tree_policy.hpp
+++ b/include/kernel/default_config/decision_tree_policy.hpp
@@ -280,13 +280,13 @@ struct fallback_optimizer {
 };
 
 template <param_optimizer_tag tag_, typename dict_t_,
-        param_optimizer_mode mode_, typename... candidates_t>
+        param_optimizer_level mode_, typename... candidates_t>
 struct decision_tree_optimizer : param_optimizer_base {
     struct impl {
         template <typename T, template <typename> typename G>
         using apply_handeler = T::template update_generator_t<G>;
         static constexpr bool keep_shape
-                = (mode_ == param_optimizer_mode::keep_shape);
+                = (mode_ == param_optimizer_level::keep_shape);
 
         using t0 = dict_t_;
         using t1 = apply_handeler<t0, decision_tree_rule::data_type_handler>;
diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index 6adbb264c..3cbd49c31 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -79,7 +79,7 @@ struct param_optimizer<param_optimizer_tag::kernel, dict_t_> {
             ? dict_t_::template find_elem_v<tune_key::gpu_arch>
             : gpu_arch::Xe;
     static constexpr auto optimizer_mode
-            = dict_t_::template find_elem_v<tune_key::param_optimizer_mode>;
+            = dict_t_::template find_elem_v<tune_key::param_optimizer_level>;
     using type = typename std::conditional<use_rule,
             decision_tree_optimizer<param_optimizer_tag::kernel, dict_t_,
                     optimizer_mode>,
@@ -208,7 +208,7 @@ struct param_optimizer<param_optimizer_tag::work_group, dict_t_> {
             && (dict_t_::template find_elem_v<tune_key::
                                 param_optimizer_type> == tune_key_value::param_optimizer_decision_tree);
     static constexpr auto optimizer_mode
-            = dict_t_::template find_elem_v<tune_key::param_optimizer_mode>;
+            = dict_t_::template find_elem_v<tune_key::param_optimizer_level>;
     static constexpr auto arch_tag
             = (dict_t_::impl::template find_elem_index<
                        tune_key::gpu_arch> != dict_t_::impl::key_not_found)
diff --git a/include/kernel/gemm/gemm_preset.hpp b/include/kernel/gemm/gemm_preset.hpp
index 5e8bfc1dd..5afeef300 100644
--- a/include/kernel/gemm/gemm_preset.hpp
+++ b/include/kernel/gemm/gemm_preset.hpp
@@ -73,8 +73,8 @@ using default_param_t = dict_t<>::template update_dict_t<
                 elem_t_t<tune_key::sg_tile_shape, shape<64, 32>>,
                 elem_v_t<tune_key::param_optimizer_type,
                         tune_key_value::param_optimizer_dummy>,
-                elem_v_t<tune_key::param_optimizer_mode,
-                        param_optimizer_mode::full, param_optimizer_mode>>;
+                elem_v_t<tune_key::param_optimizer_level,
+                        param_optimizer_level::full, param_optimizer_level>>;
 
 namespace kernel {
 template <gpu_arch arch_tag = gpu_arch::Xe>

From f1f010dfc95f2d68408cd5439d3ff460f7f29ab1 Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 12 Mar 2024 09:36:16 +0000
Subject: [PATCH 10/11] fix use of (un)named barrier on non-PVC device

---
 include/common/utils/raw_send_nbarrier.hpp   | 13 ++++++++++---
 include/group/gemm/impl/default_fpu_xe.hpp   |  6 ++++--
 include/group/gemm/impl/default_xmx_xe.hpp   |  6 ++++--
 include/group/gemm/impl/unaligned_xmx_xe.hpp |  8 ++++----
 include/kernel/default_config/common.hpp     |  2 +-
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/common/utils/raw_send_nbarrier.hpp b/include/common/utils/raw_send_nbarrier.hpp
index 7bde822b0..13bc45426 100644
--- a/include/common/utils/raw_send_nbarrier.hpp
+++ b/include/common/utils/raw_send_nbarrier.hpp
@@ -111,15 +111,22 @@ struct xetla_nbarrier_t<num_producers, num_consumers, arch_tag,
 
     /// @brief Generic work-group split barrier.
     ///
-    __XETLA_API void arrive() { __ESIMD_NS::barrier(); }
+    __XETLA_API void arrive() {
+        __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::signal>();
+    }
 
     /// @brief named barrier wait within subgroup.
     ///
-    __XETLA_API void wait() { __ESIMD_NS::barrier(); }
+    __XETLA_API void wait() {
+        __ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::wait>();
+    }
 
     /// @brief named barrier signal from subgroup.
     ///
-    __XETLA_API void arrive_wait() { __ESIMD_NS::barrier(); }
+    __XETLA_API void arrive_wait() {
+        arrive();
+        wait();
+    }
 };
 
 /// @} xetla_util_named_barrier
diff --git a/include/group/gemm/impl/default_fpu_xe.hpp b/include/group/gemm/impl/default_fpu_xe.hpp
index 73e57e860..33cfa0f43 100644
--- a/include/group/gemm/impl/default_fpu_xe.hpp
+++ b/include/group/gemm/impl/default_fpu_xe.hpp
@@ -310,7 +310,8 @@ class gemm_t<
             if constexpr (enable_periodic_sync) {
                 if ((i % sync_freq) == 0) {
                     if constexpr (wg_size_x > 1) { nbarrier_a.arrive(); }
-                    if constexpr (wg_size_y > 1) { nbarrier_b.arrive(); }
+                    if constexpr (arch_tag >= gpu_arch::Xe)
+                        if constexpr (wg_size_y > 1) { nbarrier_b.arrive(); }
                 }
             }
             SW_BARRIER();
@@ -343,7 +344,8 @@ class gemm_t<
             if constexpr (enable_periodic_sync) {
                 if ((i % sync_freq) == 0) {
                     if constexpr (wg_size_x > 1) { nbarrier_a.wait(); }
-                    if constexpr (wg_size_y > 1) { nbarrier_b.wait(); }
+                    if constexpr (arch_tag >= gpu_arch::Xe)
+                        if constexpr (wg_size_y > 1) { nbarrier_b.wait(); }
                 }
             }
         }
diff --git a/include/group/gemm/impl/default_xmx_xe.hpp b/include/group/gemm/impl/default_xmx_xe.hpp
index c0d51e236..832b5d4a8 100644
--- a/include/group/gemm/impl/default_xmx_xe.hpp
+++ b/include/group/gemm/impl/default_xmx_xe.hpp
@@ -310,7 +310,8 @@ class gemm_t<
             if constexpr (enable_periodic_sync) {
                 if ((i % sync_freq) == 0) {
                     if constexpr (wg_size_x > 1) { nbarrier_a.arrive(); }
-                    if constexpr (wg_size_y > 1) { nbarrier_b.arrive(); }
+                    if constexpr (arch_tag >= gpu_arch::Xe)
+                        if constexpr (wg_size_y > 1) { nbarrier_b.arrive(); }
                 }
             }
             subgroup::tile_load<cache_hint::cached, cache_hint::cached>(
@@ -346,7 +347,8 @@ class gemm_t<
             if constexpr (enable_periodic_sync) {
                 if ((i % sync_freq) == 0) {
                     if constexpr (wg_size_x > 1) { nbarrier_a.wait(); }
-                    if constexpr (wg_size_y > 1) { nbarrier_b.wait(); }
+                    if constexpr (arch_tag >= gpu_arch::Xe)
+                        if constexpr (wg_size_y > 1) { nbarrier_b.wait(); }
                 }
             }
         }
diff --git a/include/group/gemm/impl/unaligned_xmx_xe.hpp b/include/group/gemm/impl/unaligned_xmx_xe.hpp
index 04d7dcdaf..508f9d082 100755
--- a/include/group/gemm/impl/unaligned_xmx_xe.hpp
+++ b/include/group/gemm/impl/unaligned_xmx_xe.hpp
@@ -373,7 +373,7 @@ class gemm_t<compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_,
         matB_payload.template update_tdesc<update_dir_b>(matB_t::tile_size_y);
         xetla_fence<memory_kind::shared_local>();
         nbarrier_a.arrive();
-        nbarrier_b.arrive();
+        if (arch_tag >= gpu_arch::Xe) nbarrier_b.arrive();
 #pragma unroll
         for (uint32_t i = 1; i < num_cyclic - 1; i++) {
             tile_load(partial_matA, matA_payload);
@@ -429,7 +429,7 @@ class gemm_t<compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_,
             }
 
             nbarrier_a.wait();
-            nbarrier_b.wait();
+            if (arch_tag >= gpu_arch::Xe) nbarrier_b.wait();
 
             tile_load(matA, matA_local_ld_payload);
             tile_load(matB, matB_local_ld_payload);
@@ -463,7 +463,7 @@ class gemm_t<compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_,
             }
 
             nbarrier_a.arrive();
-            nbarrier_b.arrive();
+            if (arch_tag >= gpu_arch::Xe) nbarrier_b.arrive();
             SW_BARRIER();
             matA_acc_t matA_acc;
             matB_acc_t matB_acc;
@@ -498,7 +498,7 @@ class gemm_t<compute_policy_unaligned_xmx<compute_attr_, perf_tuning_knob_,
         }
         SW_BARRIER();
         nbarrier_a.wait();
-        nbarrier_b.wait();
+        if (arch_tag >= gpu_arch::Xe) nbarrier_b.wait();
     }
 
 private:
diff --git a/include/kernel/default_config/common.hpp b/include/kernel/default_config/common.hpp
index 283388ed0..2fb4d334e 100644
--- a/include/kernel/default_config/common.hpp
+++ b/include/kernel/default_config/common.hpp
@@ -101,7 +101,7 @@ enum class param_optimizer_tag : uint8_t { kernel, work_group };
 // optimizer_mode (currently only useful with param_optimizer_decision_tree)
 enum class param_optimizer_level : uint8_t {
     full, // optimize all available options
-    keep_shape, // optimize all execept keepping the original wg/sg tile shape
+    keep_shape, // optimize all except keeping the original wg/sg tile shape
 };
 
 template <param_optimizer_tag tag_, typename dict_t_>

From 2f69d9b8ccb6f0bc719d48a9e8a8abaf0a2627a3 Mon Sep 17 00:00:00 2001
From: "Ding, Yi1" <yi1.ding@intel.com>
Date: Tue, 12 Mar 2024 10:20:05 +0000
Subject: [PATCH 11/11] optimizer_mode => optimizer_level

---
 include/kernel/default_config/common.hpp | 2 +-
 include/kernel/gemm/default_gemm.hpp     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/kernel/default_config/common.hpp b/include/kernel/default_config/common.hpp
index 2fb4d334e..fd8c621a2 100644
--- a/include/kernel/default_config/common.hpp
+++ b/include/kernel/default_config/common.hpp
@@ -98,7 +98,7 @@ enum class tune_key_value : uint8_t {
 // parameter optimizer
 
 enum class param_optimizer_tag : uint8_t { kernel, work_group };
-// optimizer_mode (currently only useful with param_optimizer_decision_tree)
+// optimizer_level (currently only useful with param_optimizer_decision_tree)
 enum class param_optimizer_level : uint8_t {
     full, // optimize all available options
     keep_shape, // optimize all except keeping the original wg/sg tile shape
diff --git a/include/kernel/gemm/default_gemm.hpp b/include/kernel/gemm/default_gemm.hpp
index 3cbd49c31..b56e1ab73 100644
--- a/include/kernel/gemm/default_gemm.hpp
+++ b/include/kernel/gemm/default_gemm.hpp
@@ -78,11 +78,11 @@ struct param_optimizer<param_optimizer_tag::kernel, dict_t_> {
                        tune_key::gpu_arch> != dict_t_::impl::key_not_found)
             ? dict_t_::template find_elem_v<tune_key::gpu_arch>
             : gpu_arch::Xe;
-    static constexpr auto optimizer_mode
+    static constexpr auto optimizer_level
             = dict_t_::template find_elem_v<tune_key::param_optimizer_level>;
     using type = typename std::conditional<use_rule,
             decision_tree_optimizer<param_optimizer_tag::kernel, dict_t_,
-                    optimizer_mode>,
+                    optimizer_level>,
             dummy_optimizer<param_optimizer_tag::kernel, dict_t_,
                     kernel::param_kslicing_g1l1_t<arch_tag>,
                     kernel::param_kslicing_g2l1_t<arch_tag>,
@@ -207,7 +207,7 @@ struct param_optimizer<param_optimizer_tag::work_group, dict_t_> {
                                param_optimizer_type> != dict_t_::impl::key_not_found)
             && (dict_t_::template find_elem_v<tune_key::
                                 param_optimizer_type> == tune_key_value::param_optimizer_decision_tree);
-    static constexpr auto optimizer_mode
+    static constexpr auto optimizer_level
             = dict_t_::template find_elem_v<tune_key::param_optimizer_level>;
     static constexpr auto arch_tag
             = (dict_t_::impl::template find_elem_index<
@@ -216,7 +216,7 @@ struct param_optimizer<param_optimizer_tag::work_group, dict_t_> {
             : gpu_arch::Xe;
     using type = typename std::conditional<use_rule,
             decision_tree_optimizer<param_optimizer_tag::work_group, dict_t_,
-                    optimizer_mode>,
+                    optimizer_level>,
             dummy_optimizer<param_optimizer_tag::work_group, dict_t_,
                     group::param_dict1_wg_t<arch_tag>>>::type::type;
 };