From ad8a477e8928c568cd6e3ed21f1ecf465c9a294f Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 14 May 2024 14:13:38 -0700
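Subject: [PATCH 01/42] SG32 tests: define SG_SZ as a macro instead of a constexpr

In all 30 SG32 Matrix e2e tests, replace the `constexpr size_t SG_SZ = 32;`
declaration with the preprocessor definition `#define SG_SZ 32`.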

---
 sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp                  | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp  | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp            | 2 +-
 sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp               | 2 +-
 sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp               | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp            | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp           | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp    | 2 +-
 .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp         | 2 +-
 .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp       | 2 +-
 .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp  | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp             | 2 +-
 .../Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp   | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp       | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp                 | 2 +-
 .../Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp       | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp            | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp           | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp                 | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp           | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp          | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp              | 2 +-
 30 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
index 182ec8e81233d..4833404610369 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
index 7b9655fe62416..3916aaff03867 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
index e88f0a0a135f5..ddfa39c541c0a 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
index 8a91d404f6948..ad644c8734475 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
index 06c1f5d3f5c96..06d459a2a3ce5 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
index 4824ff2568d30..4624110577ea2 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
@@ -13,6 +13,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
index 3bdd2ed83b08d..9d38fb7afa30d 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
@@ -17,7 +17,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
index 79383fce4b7fc..13d8df56f40a1 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
@@ -17,7 +17,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
index 275e22d5b509f..77949b4eab6d9 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
@@ -19,7 +19,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
index b9660e73e3ab2..46de02fe8f525 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
@@ -17,7 +17,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 // Sub-matrix N dimension
 static constexpr size_t SN = 16;
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
index 0ce3d22bc873b..c38d8f133264d 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_apply_bf16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
index 7040058dc8554..b93985f8e594e 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
@@ -19,7 +19,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
index 3a023df7b10f8..10391f2e7e319 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -16,7 +16,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 1b7a8ed351139..994a2217d681f 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -21,7 +21,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index 8c52421657229..4f7e3638daaf3 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -18,7 +18,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
index fc7d0c9e4eba2..2ea58e9953917 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
@@ -20,7 +20,7 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 06798015261e7..6532bcfe47bff 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -25,7 +25,7 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
index e2158368ff6f8..70e53441cb48f 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_colA_rowB_colC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
index 52d8bc9c6f4a4..b474f846d11d5 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
@@ -13,6 +13,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 
 #include "../joint_matrix_down_convert_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
index cb7b15819f2bb..f4dd217655439 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
@@ -18,7 +18,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
index d7289579098e9..c89c657c77fbc 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -21,7 +21,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp
index 09c4d6059750c..c8ee58e126732 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp
@@ -18,7 +18,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_int8_vnni_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
index ed7fb96ca104a..1848a480a0eb7 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 constexpr size_t MATRIX_K = 1024 + 24;
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp
index 6b059ed357781..b193d422c2b8c 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_ss_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp
index 5a13d4c1f1807..cfd89fcb8a1bf 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_su_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp
index 9a82aa8bb647a..18da250bc808d 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
index 504e7beac85e3..214dd10f5158f 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
index 3532e5cc4e3ba..f4b2426af93a8 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 static constexpr size_t MATRIX_K = 1024 + 14;
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp
index a4292269811f1..aec91f70bd1d7 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_us_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp
index 842977311cafa..b2d6510622736 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_uu_int8_impl.hpp"

From cd17776673411bbd8013bb41472e3640b4d97cb0 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 14 May 2024 14:19:53 -0700
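Subject: [PATCH 02/42] XMX8 tests: remove the SG_SZ definition

Drop the sub-group size definition (either `#define SG_SZ 8` or
`constexpr size_t SG_SZ = 8;`) from all 29 XMX8 Matrix e2e tests.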

---
 sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp                   | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp          | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp          | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp   | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp             | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp    | 1 -
 sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp                | 1 -
 sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp                | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp             | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp            | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp     | 1 -
 .../test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp | 1 -
 .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp        | 1 -
 .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp   | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp              | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp        | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp        | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp        | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp                  | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp             | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp    | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp            | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp            | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp           | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp               | 1 -
 29 files changed, 29 deletions(-)

diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
index aa2d2e28ac468..d7df42000249a 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
index f360bdbba6ada..826b99dfcf306 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
index 6f3aedfe506d5..a39cb6664d100 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
index ca425f7ded5d1..9ff39c8d516d0 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
index b9d49bba70abb..5bae6a3184808 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
@@ -13,7 +13,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
index 2975ab9edf6c4..87adf891cd16b 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
@@ -16,7 +16,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp
index 5aa1cd8a2a0d7..d86af51e3cd86 100644
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp
@@ -18,7 +18,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp
index ece88423d0f43..e815b46e1ed21 100644
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp
@@ -18,7 +18,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
index a84580c3f846c..4c4d6c6eb5765 100644
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp
index be1ac0f24e88c..32b8c3bc6e24f 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t SN = 8;
 
 #include "../joint_matrix_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp
index f02028d31e7ed..614a67db9ff8a 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_apply_bf16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
index b52e8085be172..fbcd21be62f75 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
@@ -15,7 +15,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
index 2e05e656e5379..c5e399bc98f48 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -15,7 +15,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 18238e4896ccb..ba24ea0dfc4b8 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -17,7 +17,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index 49b5e6eebb4ac..9d88c89c50f41 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -17,7 +17,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp
index 008db77761e3d..173ac16a42afc 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
index b72e2ed83841c..5a41f19bc2ac1 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bfloat16_32x64_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
index e6371806f3592..09c1a4ae32a92 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 static constexpr int TN = 8;
 
 #include "../joint_matrix_bfloat16_array_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
index 494a84c173edb..7d74bf8055d6b 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_colA_rowB_colC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp
index dbe060711b02a..419cc936f14e4 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp
index 728a057aedaa7..3dadaeebee511 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_int8_vnni_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
index 532af4dc5d844..07a48bd44fccd 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
@@ -8,7 +8,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t SN = 8;
 
 #include "../joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
index 944cccd310d3e..0ba69032465b9 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 static constexpr size_t MATRIX_K = 1024 + 24;
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp
index 4a3770be74f91..fbd97d215498d 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_ss_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp
index d5c7a74c20aff..2694d0135c6a1 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_su_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
index 672e8b87e22e6..a0a98e3f16d0c 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
@@ -13,7 +13,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
index aa8e00c08b658..f42f37378514d 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 constexpr size_t MATRIX_K = 1024 + 14;
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp
index 56feaaec924ad..0c5f46f6fcec6 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_us_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp
index a1643332e489f..bc08632463f22 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_uu_int8_impl.hpp"

From ae00144c0d1de9f66de6e8132e7a37f10b6227a0 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 15 May 2024 07:39:06 -0700
Subject: [PATCH 03/42] WIP abc_impl: remove SG_SZ

---
 sycl/test-e2e/Matrix/element_wise_abc.cpp        |  1 -
 sycl/test-e2e/Matrix/element_wise_abc_impl.hpp   | 16 ++++++++++++----
 .../Matrix/element_wise_all_ops_half.cpp         |  1 -
 .../Matrix/element_wise_all_ops_int8.cpp         |  1 -
 .../Matrix/element_wise_all_ops_int8_packed.cpp  |  1 -
 .../Matrix/element_wise_all_ops_tf32.cpp         |  1 -
 sycl/test-e2e/Matrix/element_wise_all_sizes.cpp  |  3 ---
 .../Matrix/elemwise_irreg_size_ops_bf16.cpp      |  2 --
 sycl/test-e2e/Matrix/get_coord_float_matC.cpp    |  1 -
 sycl/test-e2e/Matrix/get_coord_int8_matA.cpp     |  1 -
 sycl/test-e2e/Matrix/get_coord_int8_matB.cpp     |  1 -
 sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp  |  1 -
 .../Matrix/joint_matrix_annotated_ptr.cpp        |  1 -
 sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp |  1 -
 .../Matrix/joint_matrix_bf16_fill_k_cache.cpp    |  1 -
 .../joint_matrix_bf16_fill_k_cache_init.cpp      |  2 +-
 .../joint_matrix_bf16_fill_k_cache_unroll.cpp    |  1 -
 ...oint_matrix_bf16_fill_k_cache_unroll_init.cpp |  1 -
 sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp   |  1 -
 .../Matrix/joint_matrix_bfloat16_array.cpp       |  2 +-
 ...joint_matrix_bfloat16_colmajorA_colmajorB.cpp |  1 -
 .../Matrix/joint_matrix_bfloat16_packedB.cpp     |  2 --
 .../Matrix/joint_matrix_colA_rowB_colC.cpp       |  1 -
 .../Matrix/joint_matrix_down_convert.cpp         |  3 ---
 sycl/test-e2e/Matrix/joint_matrix_half.cpp       |  1 -
 .../joint_matrix_int8_colmajorA_colmajorB.cpp    |  1 -
 sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp  |  1 -
 .../Matrix/joint_matrix_opt_kernel_feature.cpp   |  1 -
 sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp |  1 -
 sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp   |  1 -
 .../Matrix/joint_matrix_rowmajorA_rowmajorB.cpp  |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp    |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp    |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_tf32.cpp       |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp |  1 -
 .../test-e2e/Matrix/joint_matrix_unaligned_k.cpp |  1 -
 sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp    |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp    |  4 ----
 38 files changed, 14 insertions(+), 65 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/element_wise_abc.cpp
index c9954fee4f898..0a6a4e4abaa03 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index bf8b2ecb4df85..8c08bfad7a867 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -15,6 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define TM 8
 #define TK 32
+class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
@@ -27,14 +28,21 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  std::cout << "Artem: before get_sg_size()\n";
+  size_t sg_size = get_sg_size<add>(q);
+  std::cout << "Artem: after get_sg_size()\n";
   q.submit([&](handler &cgh) {
      accessor accC{bufC, cgh};
      accessor accA{bufA, cgh};
      accessor accB{bufB, cgh};
 
      cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -60,14 +68,14 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_b,
                accB.template get_multi_ptr<access::decorated::no>() +
-                   sg_starty / SG_SZ * TN * vnniFactor,
+                   sg_starty / sg_size * TN * vnniFactor,
                N * vnniFactor);
            joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; });
 
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; });
          }); // parallel for
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
index fae692ff39ed9..c07d19ed73f2e 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
index 93ddcefc19ac3..e1a2cf4eecfa1 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
index 2d79d945e8980..24f82f47e8fcd 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp
index 28483b5c2092e..6e2f8dcff6384 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp
@@ -13,7 +13,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index 661027237f836..1c07e494fcc47 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -12,7 +12,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-#define SG_SZ 16
-
 #include "element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
index a2b8ef5aa8b57..7ad89965f5243 100644
--- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
+++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
@@ -21,8 +21,6 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
-#define SG_SZ 16
-
 // 10x12 is not multiply the sg size, slicing implementation will have to insert
 // padding
 #define TM 10
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
index 78a6f815df19c..57c9a00d98fd4 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
@@ -16,7 +16,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
index 6500a34f48119..67fa811f2d764 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
@@ -16,7 +16,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index 798afde072dd3..fe87e9a911b7b 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
index 408a6087206ea..0eb13cf57347c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 // Sub-matrix N dimension
 static constexpr size_t SN = 16;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp
index 896cbef04cff0..69c9eb31a4b9c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp
@@ -15,7 +15,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_annotated_ptr_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp
index 82ad15285a4fa..d58677fa2c178 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_apply_bf16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
index 0c93876db2a15..abee7d7259f28 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
@@ -18,7 +18,6 @@
 #include "common.hpp"
 #include <cstddef>
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
index 7206cb165349b..d839f3db8f481 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -14,7 +14,7 @@
 
 #include "common.hpp"
 #include <cstddef>
-#define SG_SZ 16
+
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 5518d9cb08fbc..1800901e24111 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -20,7 +20,6 @@
 #include "common.hpp"
 #include <cstddef>
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index a393f3a2ad729..701c17741f576 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -17,7 +17,6 @@
 #include "common.hpp"
 #include <cstddef>
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index d1410ac68276e..2222cbb605a15 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
index 80e1f310ce440..98ed155b297ad 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
@@ -11,7 +11,7 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#define SG_SZ 16
+
 static constexpr int TN = 16;
 
 #include "joint_matrix_bfloat16_array_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 9cd31a8c5178e..19d12915b4a95 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -20,7 +20,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
index 3e80168752545..0d592e04b606c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
@@ -12,6 +12,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-#define SG_SZ 16
 #include "joint_matrix_bfloat16_packedB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
index 7d114175dff13..354a71006e129 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "joint_matrix_colA_rowB_colC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp
index caea640677aa7..dee504c22e7f6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp
@@ -11,7 +11,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t SG_SZ = 16;
-
 #include "joint_matrix_down_convert_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
index ac09361a0799c..9281e47f572d2 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index 33c00022a5a76..fb29cc2baaf74 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -20,7 +20,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
index 02813c6720deb..8dcddb841721d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_vnni_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
index 6195ee2935892..031c7753de425 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
@@ -16,7 +16,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 static constexpr size_t SN = 16;
 
 #include "joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
index 854d3ccc85dce..d11a6498ca7dd 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 constexpr size_t MATRIX_K = 1024 + 24;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp
index 30d9278e07157..7abea83c6d287 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp
@@ -13,6 +13,5 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 #include "joint_matrix_prefetch_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
index 958bd94fe0cd3..77df6085bc09a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
@@ -16,8 +16,4 @@
 // transform. This is currently only available on AMX and XMX of PVC
 
 #include "common.hpp"
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 #include "joint_matrix_rowmajorA_rowmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp
index e487b8cdcb41d..2089e0185b0e0 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_ss_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp
index 72910c4ed5446..7a02d03b9d642 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_su_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp
index 6f34a4acbea61..922b79f356e78 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp
@@ -13,10 +13,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
index f98c8bd3c7b48..bd04b157cf667 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
index 212ac34a3a640..e1cf6cb6cf8bb 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 static constexpr size_t MATRIX_K = 1024 + 14;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp
index 409b589904847..f4237b995aad8 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_us_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp
index 59a47484a335c..a75d18b9e6967 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_uu_int8_impl.hpp"

From d26787405f0a3ee8334a1b0a8474c16897e86340 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 15 May 2024 15:21:26 -0700
Subject: [PATCH 04/42] Made tests independent of SG_SZ

---
 .../XMX8/joint_matrix_opt_kernel_feature.cpp  |  3 -
 .../test-e2e/Matrix/element_wise_abc_impl.hpp |  8 +-
 .../Matrix/element_wise_all_sizes_impl.hpp    | 54 ++++++-----
 .../Matrix/get_coord_float_matC_impl.hpp      | 15 ++--
 .../Matrix/get_coord_int8_matA_impl.hpp       | 73 ++++++++-------
 .../Matrix/joint_matrix_all_sizes_impl.hpp    | 57 ++++++------
 .../Matrix/joint_matrix_apply_bf16_impl.hpp   | 57 ++++++------
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   | 23 +++--
 .../joint_matrix_bfloat16_array_impl.hpp      | 15 ++--
 .../Matrix/joint_matrix_bfloat16_impl.hpp     | 15 ++--
 .../joint_matrix_bfloat16_packedB_impl.hpp    | 15 ++--
 .../Matrix/joint_matrix_down_convert_impl.hpp | 15 ++--
 .../Matrix/joint_matrix_half_impl.hpp         | 90 ++++++++++---------
 .../Matrix/joint_matrix_int8_vnni_impl.hpp    | 14 +--
 .../joint_matrix_opt_kernel_feature.cpp       |  3 -
 .../joint_matrix_opt_kernel_feature_impl.hpp  | 18 ++--
 .../Matrix/joint_matrix_ss_int8_impl.hpp      | 14 +--
 .../Matrix/joint_matrix_su_int8_impl.hpp      | 16 ++--
 .../Matrix/joint_matrix_tf32_impl.hpp         | 15 ++--
 .../Matrix/joint_matrix_transposeC_impl.hpp   | 89 +++++++++---------
 .../Matrix/joint_matrix_us_int8_impl.hpp      | 14 +--
 .../Matrix/joint_matrix_uu_int8_impl.hpp      | 16 ++--
 22 files changed, 357 insertions(+), 282 deletions(-)

diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
index 07a48bd44fccd..30b3522ad2442 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
@@ -7,7 +7,4 @@
 // incompatible on the current device
 
 #include "../common.hpp"
-
-constexpr size_t SN = 8;
-
 #include "../joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index 8c08bfad7a867..dea0cf882eaaf 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -15,7 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define TM 8
 #define TK 32
-class add;
+// class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
@@ -28,15 +28,13 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
-  std::cout << "Artem: before get_sg_size()\n";
-  size_t sg_size = get_sg_size<add>(q);
-  std::cout << "Artem: after get_sg_size()\n";
+  size_t sg_size = get_sg_size<class add>(q);
   q.submit([&](handler &cgh) {
      accessor accC{bufC, cgh};
      accessor accA{bufA, cgh};
      accessor accB{bufB, cgh};
 
-     cgh.parallel_for(
+     cgh.parallel_for<class add>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
index 4020e8b84bbd2..5800ab9c62745 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
@@ -23,7 +23,7 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read_write> C,
     }
 }
 
-template <typename T, typename T1, size_t TM, size_t TK>
+template <typename T, typename T1, size_t TM, size_t TK, typename kernel_name>
 void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   static constexpr size_t M = TM * M_MULTIPLIER;
   static constexpr size_t K = 128;
@@ -32,7 +32,8 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   size_t NDRangeM = M / TM;
   size_t NDRangeK = K / TK;
   queue q;
-  nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ});
+  size_t sg_size = get_sg_size<kernel_name>(q);
+  nd_range<2> r({NDRangeM, NDRangeK * sg_size}, {1, 1 * sg_size});
   big_matrix<T, M, K> A((T *)&MatA);
 
   buffer<T, 2> bufA(A.get_data(), range<2>(M, K));
@@ -40,8 +41,12 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   q.submit([&](handler &cgh) {
      sycl::accessor accA{bufA, cgh, sycl::read_write};
 
-     cgh.parallel_for(
-         r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<kernel_name>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -57,41 +62,42 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * K + sg_starty / SG_SZ * TK,
+                   (sg_startx * TM) * K + sg_starty / sg_size * TK,
                K);
          }); // parallel for
    }).wait();
   assert_ops_ref<T, M, K>(bufA.get_host_access(), result);
 }
 
-template <typename Ta, size_t tM, size_t tK> void add_ref() {
+template <typename Ta, size_t tM, size_t tK, typename kernel_name>
+void add_ref() {
   if constexpr (std::is_same_v<Ta, bfloat16>) {
     // Tests whether 5 + 2 = 7 operation is successful.
-    matrix_verify_add<bfloat16, bfloat16, tM, tK>(bfloat16(5.0), bfloat16(2.0),
-                                                  bfloat16(7.0));
+    matrix_verify_add<bfloat16, bfloat16, tM, tK, kernel_name>(
+        bfloat16(5.0), bfloat16(2.0), bfloat16(7.0));
   }
   if constexpr (std::is_same_v<Ta, int8_t>) {
-    matrix_verify_add<int8_t, int, tM, tK>(5 /*val1*/, 2 /*val2*/,
-                                           7 /*result*/);
+    matrix_verify_add<int8_t, int, tM, tK, kernel_name>(5 /*val1*/, 2 /*val2*/,
+                                                        7 /*result*/);
   }
 }
 
 int main() {
-  add_ref<bfloat16, 1 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 2 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 3 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 4 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 5 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 6 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 7 /*TM*/, 16 /*TK*/>();
+  add_ref<bfloat16, 1 /*TM*/, 16 /*TK*/, class test_bfloat16_1>();
+  add_ref<bfloat16, 2 /*TM*/, 16 /*TK*/, class test_bfloat16_2>();
+  add_ref<bfloat16, 3 /*TM*/, 16 /*TK*/, class test_bfloat16_3>();
+  add_ref<bfloat16, 4 /*TM*/, 16 /*TK*/, class test_bfloat16_4>();
+  add_ref<bfloat16, 5 /*TM*/, 16 /*TK*/, class test_bfloat16_5>();
+  add_ref<bfloat16, 6 /*TM*/, 16 /*TK*/, class test_bfloat16_6>();
+  add_ref<bfloat16, 7 /*TM*/, 16 /*TK*/, class test_bfloat16_7>();
 
-  add_ref<int8_t, 1 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 2 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 3 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 4 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 5 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 6 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 7 /*TM*/, 32 /*TK*/>();
+  add_ref<int8_t, 1 /*TM*/, 32 /*TK*/, class test_int8_1>();
+  add_ref<int8_t, 2 /*TM*/, 32 /*TK*/, class test_int8_2>();
+  add_ref<int8_t, 3 /*TM*/, 32 /*TK*/, class test_int8_3>();
+  add_ref<int8_t, 4 /*TM*/, 32 /*TK*/, class test_int8_4>();
+  add_ref<int8_t, 5 /*TM*/, 32 /*TK*/, class test_int8_5>();
+  add_ref<int8_t, 6 /*TM*/, 32 /*TK*/, class test_int8_6>();
+  add_ref<int8_t, 7 /*TM*/, 32 /*TK*/, class test_int8_7>();
 
   std::cout << "Passed\n";
 }
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
index bedc91bdc39d4..b424a01a7c6a6 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
@@ -28,13 +28,18 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
   buffer<float> sum_rows_v(sum_rows, M);
 
   queue q;
+  size_t sg_size = get_sg_size<class add>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto v = sum_rows_v.get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for(
-         nd_range<2>({M / TM, N / TN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class add>(
+         nd_range<2>({M / TM, N / TN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+           {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -49,7 +54,7 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
 
            float sum_local_rows[M] = {0};
@@ -62,7 +67,7 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
              sum_local_rows[i] =
                  reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
              // only Groups leader perform the global reduction
-             if (global_idy % SG_SZ == 0) {
+             if (global_idy % sg_size == 0) {
                sycl::atomic_ref<float, sycl::memory_order::relaxed,
                                 sycl::memory_scope::device>
                    aref(v[i]);
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
index afda0f90a6e37..6f57ab5b4e63c 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
@@ -72,45 +72,54 @@ W0 --> 0 0 1 1 2 2 3 3 .... 7 7
 // clang-format on
 
 template <typename T, size_t M, size_t K>
-void matrix_sum_rows(queue q, big_matrix<T, M, K> &A, nd_range<2> &r) {
+void matrix_sum_rows(big_matrix<T, M, K> &A) {
   buffer<int8_t, 2> bufA(A.get_data(), range<2>(M, K));
 
   // size of vector is equal to number of rows in big matrix
   int sum_rows[M] = {0};
   buffer<int> sum_rows_v(sum_rows, M);
+  queue q;
+  size_t sg_size = get_sg_size<class add>(q);
+  nd_range<2> r({M / TM, K / TK * sg_size}, {1, 1 * sg_size});
   q.submit([&](handler &cgh) {
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto v = sum_rows_v.get_access<access::mode::atomic>(cgh);
 
-     cgh.parallel_for(r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(
-                             SG_SZ)]] {
-       const auto global_idx = spmd_item.get_global_id(0);
-       const auto global_idy = spmd_item.get_global_id(1);
-       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-       const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-       sycl::sub_group sg = spmd_item.get_sub_group();
-       joint_matrix<sub_group, int8_t, use::a, TM, TK, layout::row_major> sub_a;
-       joint_matrix_load(sg, sub_a,
-                         accA.template get_multi_ptr<access::decorated::no>() +
-                             (sg_startx * TM * K) + sg_starty / SG_SZ * TK,
-                         K);
-
-       int32_t sum_local_rows[M] = {0};
-
-       ext::intel::experimental::matrix::joint_matrix_apply(
-           sg, sub_a, [&](int8_t &x, size_t row, size_t col) {
-             sum_local_rows[row + global_idx * TM] += x;
-           });
-       for (int i = 0; i < M; ++i) {
-         sum_local_rows[i] =
-             reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
-
-         // only Groups leader performs the global reduction
-         if (global_idy % SG_SZ == 0)
-           atomic_fetch_add(v[i], sum_local_rows[i]);
-       }
-     }); // parallel for
+     cgh.parallel_for<class add>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sycl::sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, int8_t, use::a, TM, TK, layout::row_major>
+               sub_a;
+           joint_matrix_load(
+               sg, sub_a,
+               accA.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM * K) + sg_starty / sg_size * TK,
+               K);
+
+           int32_t sum_local_rows[M] = {0};
+
+           ext::intel::experimental::matrix::joint_matrix_apply(
+               sg, sub_a, [&](int8_t &x, size_t row, size_t col) {
+                 sum_local_rows[row + global_idx * TM] += x;
+               });
+           for (int i = 0; i < M; ++i) {
+             sum_local_rows[i] =
+                 reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
+
+             // only Groups leader performs the global reduction
+             if (global_idy % sg_size == 0)
+               atomic_fetch_add(v[i], sum_local_rows[i]);
+           }
+         }); // parallel for
    }).wait();
   sum_rows_ref<T, M, K>(bufA.get_host_access(), sum_rows_v.get_host_access());
 }
@@ -124,8 +133,3 @@ int main() {
 
-  size_t NDRangeM = MATRIX_M / TM;
-  size_t NDRangeK = MATRIX_K / TK;
-  queue q;
-  nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ});
-
   for (int i = 0; i < MATRIX_M; i++) {
     for (int j = 0; j < MATRIX_K; j++) {
@@ -133,7 +137,7 @@ int main() {
     }
   }
 
-  matrix_sum_rows<int8_t, MATRIX_M, MATRIX_K>(q, MA, r);
+  matrix_sum_rows<int8_t, MATRIX_M, MATRIX_K>(MA);
   std::cout << "Passed\n";
   return 0;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
index edfcfe1d2e979..8e9880235c2b2 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
@@ -9,7 +9,7 @@
 static constexpr size_t M_MULTIPLIER = 16;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
-          int vnniFactor, size_t TM, size_t TN, size_t TK>
+          int vnniFactor, size_t TM, size_t TN, size_t TK, typename kernel_name>
 void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                      big_matrix<T2, K / vnniFactor, N * vnniFactor> &B) {
   size_t NDRangeM = M / TM;
@@ -19,15 +19,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
      sycl::accessor accC{bufC, cgh, sycl::read_write};
      sycl::accessor accA{bufA, cgh, sycl::read_only};
      sycl::accessor accB{bufB, cgh, sycl::read_only};
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+     cgh.parallel_for<kernel_name>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -47,7 +50,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -59,21 +62,21 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
                      (k * TK / vnniFactor) * (N * vnniFactor) +
-                     sg_starty / SG_SZ * TN * vnniFactor,
+                     sg_starty / sg_size * TN * vnniFactor,
                  N * vnniFactor);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
 }
 
 template <typename Ta, typename Tc, int vnni_factor, size_t tM, size_t tN,
-          size_t tK>
+          size_t tK, typename kernel_name>
 int init_and_multiply() {
   static constexpr size_t MATRIX_M = tM * M_MULTIPLIER;
   static constexpr size_t MATRIX_N = 128;
@@ -100,7 +103,7 @@ int init_and_multiply() {
       (Ta *)&Bvnni);
 
   matrix_multiply<Tc, Ta, MATRIX_M, MATRIX_N, MATRIX_K, vnni_factor, tM, tN,
-                  tK>(MC, MA, MBvnni);
+                  tK, kernel_name>(MC, MA, MBvnni);
   matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K);
 
   bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D);
@@ -110,23 +113,23 @@ int init_and_multiply() {
 
 int main() {
   int errors = 0;
-  errors += init_and_multiply<bfloat16, float, 2, 1, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 2, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 3, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 4, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 5, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 6, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 7, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 8, SN, 16>();
-
-  errors += init_and_multiply<int8_t, int32_t, 4, 1, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 2, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 3, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 4, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 5, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 6, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 7, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 8, SN, 32>();
+  errors += init_and_multiply<bfloat16, float, 2, 1, SN, 16, class bf16_1>();
+  errors += init_and_multiply<bfloat16, float, 2, 2, SN, 16, class bf16_2>();
+  errors += init_and_multiply<bfloat16, float, 2, 3, SN, 16, class bf16_3>();
+  errors += init_and_multiply<bfloat16, float, 2, 4, SN, 16, class bf16_4>();
+  errors += init_and_multiply<bfloat16, float, 2, 5, SN, 16, class bf16_5>();
+  errors += init_and_multiply<bfloat16, float, 2, 6, SN, 16, class bf16_6>();
+  errors += init_and_multiply<bfloat16, float, 2, 7, SN, 16, class bf16_7>();
+  errors += init_and_multiply<bfloat16, float, 2, 8, SN, 16, class bf16_8>();
+
+  errors += init_and_multiply<int8_t, int32_t, 4, 1, SN, 32, class int8_1>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 2, SN, 32, class int8_2>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 3, SN, 32, class int8_3>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 4, SN, 32, class int8_4>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 5, SN, 32, class int8_5>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 6, SN, 32, class int8_6>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 7, SN, 32, class int8_7>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 8, SN, 32, class int8_8>();
 
   return errors;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
index 1ec089d0f53f2..796bdce8d0752 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
@@ -13,35 +13,41 @@ template <typename T> struct apply_add {
   void operator()(T &x) const { x = x + bfloat16(2); }
 };
 
-template <typename T, size_t M, size_t N, typename F>
-void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
-                       const float ref, F &&lambda) {
+template <typename T, size_t M, size_t N, typename kernel_name, typename F>
+void matrix_verify_add(big_matrix<T, M, N> &A, const float ref, F &&lambda) {
   buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
 
+  queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
+  nd_range<2> r({M / TM, N / TN * sg_size}, {1, 1 * sg_size});
+
   q.submit([&](handler &cgh) {
      accessor accA{bufA, cgh};
 
-     cgh.parallel_for(r, [accA, lambda](
-                             nd_item<2> spmd_item) [[intel::reqd_sub_group_size(
-                             SG_SZ)]] {
-       const auto global_idx = spmd_item.get_global_id(0);
-       const auto global_idy = spmd_item.get_global_id(1);
-       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-       const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+     cgh.parallel_for<kernel_name>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
-       sub_group sg = spmd_item.get_sub_group();
-       joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;
 
-       joint_matrix_fill(sg, sub_a, bfloat16(5.0));
+           joint_matrix_fill(sg, sub_a, bfloat16(5.0));
 
-       joint_matrix_apply(sg, sub_a, lambda);
+           joint_matrix_apply(sg, sub_a, lambda);
 
-       ext::intel::experimental::matrix::joint_matrix_store(
-           sg, sub_a,
-           accA.template get_multi_ptr<access::decorated::no>() +
-               (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-           N);
-     }); // parallel for
+           ext::intel::experimental::matrix::joint_matrix_store(
+               sg, sub_a,
+               accA.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N);
+         }); // parallel for
    }).wait();
   // Check if the results are correct
   {
@@ -61,14 +67,9 @@ int main() {
 
   big_matrix<bfloat16, MATRIX_M, MATRIX_N> MA((bfloat16 *)&A);
 
-  size_t NDRangeM = MATRIX_M / TM;
-  size_t NDRangeN = MATRIX_N / TN;
-  queue q;
-  nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});
-
-  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(
-      q, MA, r, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
-  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 7.0,
+  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class add>(
+      MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
+  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class func_add>(MA, 7.0,
                                                   apply_add<bfloat16>());
   std::cout << "Passed\n";
   return 0;
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 1c29f866f134c..36cfb5ea1f069 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -68,8 +68,9 @@ static constexpr void manually_unroll_loop(F &&f) {
 
 template <unsigned int rowsA, unsigned int colsA, unsigned int rowsB,
           unsigned int colsB, unsigned int vnniFactor, typename TOperand,
-          typename TResult, unsigned int sgSize = SG_SZ>
+          typename TResult>
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
+  size_t sgSize = get_sg_size<class MatMul>(q);
   range<2> global{rowsA / MCACHE1, (colsB / NCACHE1) * sgSize};
   range<2> cachelocal{MCACHE2 / MCACHE1, NCACHE2 / NCACHE1 * sgSize};
 
@@ -82,12 +83,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
   std::chrono::high_resolution_clock::time_point start =
       std::chrono::high_resolution_clock::now();
 
-  auto mk = q.submit([&](handler &h) {
-    h.parallel_for( // cache layer#1
+  auto work = [&](handler &h) { // not static: [&] must bind this call's locals
+    h.parallel_for<class MatMul>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
         // loop localrange
-        [=](nd_item<2> it) [[intel::reqd_sub_group_size(sgSize)]] {
+        [=](nd_item<2> it)
+#ifdef SG_SZ
+            [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
           auto pA =
               address_space_cast<sycl::access::address_space::global_space,
                                  sycl::access::decorated::no>(A);
@@ -243,8 +248,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
             });     // for k1
 #else
                 } // n
-              }   // m
-            }     // k1
+              } // m
+            } // k1
 #endif
           } // for k2
 #ifdef MANUAL_UNROLL
@@ -267,10 +272,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           });   // m
 #else
             } // n
-          }   // m
+          } // m
 #endif
         }); // parallel_for
-  });       // queue.submit
+  }; // work (command-group function)
+  q.submit(work);
+
   if (i == testIterations - 1)
     q.wait();
   std::chrono::duration<double, std::milli> duration =
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
index 5be3c485312c2..bc317ffc27d31 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
@@ -23,14 +23,19 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class imatrix>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // Matrix API has to be accessed by all the workitems in a
            // subgroup. These functions will be called once by the subgroup.
            // No code divergence between the workitems.
@@ -57,7 +62,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
                  N * 2);
 
              for (int i = 0; i < JM_ARRAY_SZ; ++i) {
@@ -75,7 +80,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                  sg, sub_c[i],
                  accC.template get_multi_ptr<access::decorated::no>() +
                      (sg_startx * TM * JM_ARRAY_SZ + TM * i) * N +
-                     sg_starty / SG_SZ * TN,
+                     sg_starty / sg_size * TN,
                  N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
index 8cb6c120d8a34..068506cc63724 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
@@ -19,15 +19,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -49,7 +52,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) { //
              joint_matrix_load(
@@ -60,14 +63,14 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
                  N * 2);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
index 91156c3fcc128..36ce0f81f0c63 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
@@ -17,15 +17,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<kernel_name>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -47,7 +50,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) { //
              joint_matrix_load(
@@ -59,14 +62,14 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
                  N * 2);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
index 3f02be1358844..54861eb3b1d3b 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
@@ -23,13 +23,18 @@ void matrix_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class copy>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::write>(cgh);
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class copy>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -46,13 +51,13 @@ void matrix_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            joint_matrix_copy(sg, sub_c, sub_a);
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp
index aad8aeaa5c602..53b4ca7b97412 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp
@@ -27,56 +27,60 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<float, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class mult>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
-             [[intel::reqd_sub_group_size(SG_SZ)]] {
-               // The submatrix API has to be accessed by all the workitems in a
-               // subgroup these functions will be called once by the subgroup
-               // no code divergence between the workitems
-               const auto global_idx = spmd_item.get_global_id(0);
-               const auto global_idy = spmd_item.get_global_id(1);
-               const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-               const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+     cgh.parallel_for<class mult>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           // The submatrix API has to be accessed by all the workitems in a
+           // subgroup these functions will be called once by the subgroup
+           // no code divergence between the workitems
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
-               sub_group sg = spmd_item.get_sub_group();
-               joint_matrix<sub_group, half, use::a, TM, TK, layout::row_major>
-                   sub_a;
-               // For B, we assume B has been already VNNIed.
-               joint_matrix<sub_group, half, use::b, TK, TN,
-                            layout::ext_intel_packed>
-                   sub_b;
-               joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, half, use::a, TM, TK, layout::row_major>
+               sub_a;
+           // For B, we assume B has been already VNNIed.
+           joint_matrix<sub_group, half, use::b, TK, TN,
+                        layout::ext_intel_packed>
+               sub_b;
+           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
 
-               joint_matrix_load(
-                   sg, sub_c,
-                   accC.template get_multi_ptr<access::decorated::no>() +
-                       (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-                   N, layout::row_major);
-               for (int k = 0; k < K / TK; k += 1) {
-                 joint_matrix_load(
-                     sg, sub_a,
-                     accA.template get_multi_ptr<access::decorated::no>() +
-                         (sg_startx * TM) * K + k * TK,
-                     K);
-                 joint_matrix_load(
-                     sg, sub_b,
-                     accB.template get_multi_ptr<access::decorated::no>() +
-                         (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
-                     N * 2);
-                 joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-               }
-               joint_matrix_store(
-                   sg, sub_c,
-                   accC.template get_multi_ptr<access::decorated::no>() +
-                       (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-                   N, layout::row_major);
-             }); // parallel for
+           joint_matrix_load(
+               sg, sub_c,
+               accC.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+           for (int k = 0; k < K / TK; k += 1) {
+             joint_matrix_load(
+                 sg, sub_a,
+                 accA.template get_multi_ptr<access::decorated::no>() +
+                     (sg_startx * TM) * K + k * TK,
+                 K);
+             joint_matrix_load(
+                 sg, sub_b,
+                 accB.template get_multi_ptr<access::decorated::no>() +
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
+                 N * 2);
+             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+           }
+           joint_matrix_store(
+               sg, sub_c,
+               accC.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+         }); // parallel for
    }).wait();
 }
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
index 96993082d8cb5..625b41f3037b8 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
@@ -26,15 +26,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -61,14 +65,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK) * N + sg_starty / SG_SZ * TN,
+                     (k * TK) * N + sg_starty / sg_size * TN,
                  N);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
index 031c7753de425..5acc54a412096 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
@@ -15,7 +15,4 @@
 // incompatible on the current device
 
 #include "common.hpp"
-
-static constexpr size_t SN = 16;
-
 #include "joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp
index a0b468120ebd3..7aba5911c8386 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp
@@ -22,14 +22,19 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      sycl::accessor accC{bufC, cgh, sycl::read_write};
      sycl::accessor accA{bufA, cgh, sycl::read_only};
      sycl::accessor accB{bufB, cgh, sycl::read_only};
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class imatrix>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -44,7 +49,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -56,7 +61,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
                      (k * TK / vnniFactor) * (N * vnniFactor) +
-                     sg_starty / SG_SZ * TN * vnniFactor,
+                     sg_starty / sg_size * TN * vnniFactor,
                  N * vnniFactor);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
@@ -95,8 +100,9 @@ int main() {
     init_and_multiply<bfloat16, float, 2, 1, 500,
                       16>(); // 500 is not correct size
   } catch (const sycl::exception &e) {
-    if (e.code() == errc::kernel_not_supported)
+    if (e.code() == errc::invalid)
       return 0;
+    throw;
   }
 
   return 1;
diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp
index ef67ebbd951f3..3e00c667c2505 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp
@@ -28,15 +28,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -64,14 +68,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp
index 3973a7b516bc8..f8feb25d99229 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp
@@ -28,15 +28,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -57,7 +61,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -68,14 +72,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
index 2b2fae59cd94d..536fa84581f27 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
@@ -27,15 +27,18 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item) 
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The matrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -56,7 +59,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K; k += TK) {
              joint_matrix_load(
@@ -67,7 +70,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k) * (N) + sg_starty / SG_SZ * TN,
+                     (k) * (N) + sg_starty / sg_size * TN,
                  N);
              // If no rounding to tf32 function is called, joint_matrix_mad
              // function will work on truncated floats.
@@ -81,7 +84,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index 624cfdb256e7d..faea43b062477 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -11,6 +11,9 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
+template <size_t TM>
+class LS;
+
 template <size_t TM, size_t TN, typename T1, size_t NUM_ROWS, size_t NUM_COLS>
 void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
                            queue q) {
@@ -22,47 +25,51 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
 
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
-
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
-           auto p_input =
-               address_space_cast<sycl::access::address_space::global_space,
-                                  sycl::access::decorated::no>(input);
-
-           auto p_out_col_major =
-               address_space_cast<sycl::access::address_space::global_space,
-                                  sycl::access::decorated::no>(out_col_major);
-           auto p_out_row_major =
-               address_space_cast<sycl::access::address_space::global_space,
-                                  sycl::access::decorated::no>(out_row_major);
-
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
-
-           auto row_major_offset =
-               (sg_startx * TM) * N + (sg_starty / SG_SZ * TN);
-           auto col_major_offset =
-               (sg_startx * TM) + (sg_starty / SG_SZ * TN) * M;
-
-           joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
-                             layout::col_major);
-
-           joint_matrix_store(sg, sub_matrix,
-                              p_out_col_major + row_major_offset, N,
-                              layout::row_major);
-
-           joint_matrix_store(sg, sub_matrix,
-                              p_out_row_major + col_major_offset, M,
-                              layout::col_major);
-         }); // parallel for
-   }).wait();
+  size_t sg_size = get_sg_size<class LS<TM>>(q);
+
+  auto work = [&](handler &cgh) { // not static: [&] binds this call's locals
+    cgh.parallel_for<class LS<TM>>(
+        nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+        [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+            [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
+          auto p_input =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(input);
+
+          auto p_out_col_major =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(out_col_major);
+          auto p_out_row_major =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(out_row_major);
+
+          const auto global_idx = spmd_item.get_global_id(0);
+          const auto global_idy = spmd_item.get_global_id(1);
+          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+          const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+          sub_group sg = spmd_item.get_sub_group();
+          joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
+
+          auto row_major_offset =
+              (sg_startx * TM) * N + (sg_starty / sg_size * TN);
+          auto col_major_offset =
+              (sg_startx * TM) + (sg_starty / sg_size * TN) * M;
+
+          joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
+                            layout::col_major);
+
+          joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset,
+                             N, layout::row_major);
+
+          joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset,
+                             M, layout::col_major);
+        }); // parallel for
+  };
+  q.submit(work).wait();
 }
 
 template <size_t TM> void run_matrix_test() {
diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp
index 5441df5fe2542..db8eda82ef239 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp
@@ -28,16 +28,18 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
-
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -59,7 +61,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -71,14 +73,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp
index 4dcb60f4330fc..7e7edb700debb 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp
@@ -28,15 +28,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -57,7 +61,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -69,14 +73,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();

From 0a71b8844cefb148be74961576b86d57ff6f062e Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 15 May 2024 15:37:33 -0700
Subject: [PATCH 05/42] clang-format

---
 sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp          | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp        | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp       | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp   | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp         | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp     | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp             | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp       | 3 +--
 9 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
index b424a01a7c6a6..32ceaf8c730a0 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
@@ -39,7 +39,7 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-           {
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
index 8e9880235c2b2..b48e46e18de3d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
@@ -102,8 +102,8 @@ int init_and_multiply() {
   big_matrix<Ta, MATRIX_K / vnni_factor, MATRIX_N * vnni_factor> MBvnni(
       (Ta *)&Bvnni);
 
-  matrix_multiply<Tc, Ta, MATRIX_M, MATRIX_N, MATRIX_K, vnni_factor, tM, tN,
-                  tK, kernel_name>(MC, MA, MBvnni);
+  matrix_multiply<Tc, Ta, MATRIX_M, MATRIX_N, MATRIX_K, vnni_factor, tM, tN, tK,
+                  kernel_name>(MC, MA, MBvnni);
   matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K);
 
   bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D);
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
index 796bdce8d0752..3d3c6304952e5 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
@@ -69,8 +69,8 @@ int main() {
 
   matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class add>(
       MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
-  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class func_add>(MA, 7.0,
-                                                  apply_add<bfloat16>());
+  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class func_add>(
+      MA, 7.0, apply_add<bfloat16>());
   std::cout << "Passed\n";
   return 0;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
index bc317ffc27d31..9aefc370bd0c6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
@@ -31,11 +31,11 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-          {
+         {
            // Matrix API has to be accessed by all the workitems in a
            // subgroup. These functions will be called once by the subgroup.
            // No code divergence between the workitems.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
index 068506cc63724..aef22d35f7d17 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
@@ -27,7 +27,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
index 36ce0f81f0c63..6a7182c41985d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
@@ -25,7 +25,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<kernel_name>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
index 54861eb3b1d3b..8ac48511c7e10 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
@@ -30,11 +30,11 @@ void matrix_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
 
      cgh.parallel_for<class copy>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-          {
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
index 536fa84581f27..69991884c0710 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
@@ -35,7 +35,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index faea43b062477..5de94de6a18ba 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -11,8 +11,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-template <size_t TM>
-class LS;
+template <size_t TM> class LS;
 
 template <size_t TM, size_t TN, typename T1, size_t NUM_ROWS, size_t NUM_COLS>
 void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,

From 5621804564637714f3e28356184a122279ec582c Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Thu, 16 May 2024 08:15:33 -0700
Subject: [PATCH 06/42] Clean up nits

---
 sycl/test-e2e/Matrix/element_wise_abc_impl.hpp                | 1 -
 sycl/test-e2e/Matrix/get_coord_float_matC.cpp                 | 4 ----
 sycl/test-e2e/Matrix/get_coord_int8_matA.cpp                  | 4 ----
 sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp             | 3 ---
 sycl/test-e2e/Matrix/get_coord_int8_matB.cpp                  | 4 ----
 sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp               | 3 ---
 sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp                | 3 ---
 .../Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp      | 3 ---
 sycl/test-e2e/Matrix/joint_matrix_half.cpp                    | 3 ---
 .../test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp | 3 ---
 sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp               | 3 ---
 11 files changed, 34 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index dea0cf882eaaf..655fa90275f40 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -15,7 +15,6 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define TM 8
 #define TK 32
-// class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
index 57c9a00d98fd4..af7e8e1745781 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
@@ -11,10 +11,6 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TN = 16;
 
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
index 67fa811f2d764..d29217577443e 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
@@ -11,10 +11,6 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TN = 16;
 
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
index 6f57ab5b4e63c..3f39ebf731801 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
@@ -131,9 +131,6 @@ int main() {
 
   big_matrix<int8_t, MATRIX_M, MATRIX_K> MA((int8_t *)&A);
 
-  size_t NDRangeM = MATRIX_M / TM;
-  size_t NDRangeK = MATRIX_K / TK;
-
   for (int i = 0; i < MATRIX_M; i++) {
     for (int j = 0; j < MATRIX_K; j++) {
       A[i][j] = i + j;
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index fe87e9a911b7b..be35a4e672c30 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -12,10 +12,6 @@
 // XFAIL: *
 
 #include "common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TN = 16;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
index 0eb13cf57347c..1478914d1e44f 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 // Sub-matrix N dimension
 static constexpr size_t SN = 16;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index 2222cbb605a15..1985bcb6a4fb9 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 19d12915b4a95..21d5f1239cd8d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -17,9 +17,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
index 9281e47f572d2..0bacfa93792d6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
@@ -14,9 +14,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index fb29cc2baaf74..37769a41f7003 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -17,9 +17,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
index 8dcddb841721d..f592057ce94d5 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_vnni_impl.hpp"

From 23a7afcd215b08be355c220a11d7ee8c4b93135b Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Thu, 16 May 2024 12:22:22 -0700
Subject: [PATCH 07/42] Managed to remove the static code

---
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   |  5 +-
 .../Matrix/joint_matrix_transposeC_impl.hpp   | 83 ++++++++++---------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 36cfb5ea1f069..51382467c0459 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -83,7 +83,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
   std::chrono::high_resolution_clock::time_point start =
       std::chrono::high_resolution_clock::now();
 
-  static auto work = [&](handler &h) {
+  q.submit([&](handler &h) {
     h.parallel_for<class MatMul>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
@@ -275,8 +275,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           } // m
 #endif
         }); // parallel_for
-  }; // queue.submit
-  q.submit(work);
+  });       // queue.submit
 
   if (i == testIterations - 1)
     q.wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index 5de94de6a18ba..24ba24a264f0d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -26,49 +26,50 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
   size_t NDRangeN = N / TN;
   size_t sg_size = get_sg_size<class LS<TM>>(q);
 
-  static auto work = [&](handler &cgh) {
-    cgh.parallel_for<class LS<TM>>(
-        nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-        [=](nd_item<2> spmd_item)
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<class LS<TM>>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
-            [[intel::reqd_sub_group_size(SG_SZ)]]
+             [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-        {
-          auto p_input =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(input);
-
-          auto p_out_col_major =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(out_col_major);
-          auto p_out_row_major =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(out_row_major);
-
-          const auto global_idx = spmd_item.get_global_id(0);
-          const auto global_idy = spmd_item.get_global_id(1);
-          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-          const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-          sub_group sg = spmd_item.get_sub_group();
-          joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
-
-          auto row_major_offset =
-              (sg_startx * TM) * N + (sg_starty / sg_size * TN);
-          auto col_major_offset =
-              (sg_startx * TM) + (sg_starty / sg_size * TN) * M;
-
-          joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
-                            layout::col_major);
-
-          joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset,
-                             N, layout::row_major);
-
-          joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset,
-                             M, layout::col_major);
-        }); // parallel for
-  };
-  q.submit(work).wait();
+         {
+           auto p_input =
+               address_space_cast<sycl::access::address_space::global_space,
+                                  sycl::access::decorated::no>(input);
+
+           auto p_out_col_major =
+               address_space_cast<sycl::access::address_space::global_space,
+                                  sycl::access::decorated::no>(out_col_major);
+           auto p_out_row_major =
+               address_space_cast<sycl::access::address_space::global_space,
+                                  sycl::access::decorated::no>(out_row_major);
+
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
+
+           auto row_major_offset =
+               (sg_startx * TM) * N + (sg_starty / sg_size * TN);
+           auto col_major_offset =
+               (sg_startx * TM) + (sg_starty / sg_size * TN) * M;
+
+           joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
+                             layout::col_major);
+
+           joint_matrix_store(sg, sub_matrix,
+                              p_out_col_major + row_major_offset, N,
+                              layout::row_major);
+
+           joint_matrix_store(sg, sub_matrix,
+                              p_out_row_major + col_major_offset, M,
+                              layout::col_major);
+         }); // parallel for
+   }).wait();
 }
 
 template <size_t TM> void run_matrix_test() {

From a3c310b7994b4a507a218aee0eb21404d066fc80 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 08:03:57 -0700
Subject: [PATCH 08/42] Pass: elemwise_irreg_size_ops_bf16.cpp

---
 .../Matrix/elemwise_irreg_size_ops_bf16.cpp   | 36 +++++--------------
 1 file changed, 9 insertions(+), 27 deletions(-)

diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
index 7ad89965f5243..4d453fe35da5c 100644
--- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
+++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
@@ -16,10 +16,7 @@
 
 #include <iostream>
 #include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
+#include "common.hpp"
 
 // 10x12 is not multiply the sg size, slicing implementation will have to insert
 // padding
@@ -27,16 +24,6 @@ using bfloat16 = sycl::ext::oneapi::bfloat16;
 #define TN 12
 #define TK 16
 
-template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
-public:
-  T *mat;
-
-public:
-  T *get_data() { return mat; }
-  void set_data(T *data) { mat = data; }
-  big_matrix(T *data) : mat(data) {}
-};
-
 template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
           size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
           size_t NUM_COLS_C>
@@ -55,16 +42,18 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
-
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -85,7 +74,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K; k += TK) {
              joint_matrix_load(
@@ -97,7 +86,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k) * (N) + sg_starty / SG_SZ * TN * 2,
+                     (k) * (N) + sg_starty / sg_size * TN * 2,
                  N * 2);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
@@ -105,7 +94,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
@@ -119,13 +108,6 @@ bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
 float C[MATRIX_M][MATRIX_N];
 float D[MATRIX_M][MATRIX_N];
 
-float make_fp32(bfloat16 x) {
-  unsigned int y = *((int *)&x);
-  y = y << 16;
-  float *res = reinterpret_cast<float *>(&y);
-  return *res;
-}
-
 void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N,
                          int K) {
   for (int m = 0; m < M; m++)

From 22da1c2a6d2c577d7aa94e8e41a037c6cbf6fcef Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 08:15:04 -0700
Subject: [PATCH 09/42] Pass: joint_matrix_annotated_ptr

---
 .../joint_matrix_annotated_ptr_impl.hpp       | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp
index 38e331bb04181..2eeba80572608 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp
@@ -11,16 +11,22 @@
 #define TM 8
 #define TK 16
 
+template <unsigned int vnniFactor>
+class mult;
+
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           unsigned int vnniFactor>
 void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) {
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
+  size_t sg_size = get_sg_size<mult<vnniFactor>>(q);
   q.submit([&](handler &cgh) {
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+     cgh.parallel_for<mult<vnniFactor>>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
@@ -53,20 +59,20 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) {
                    syclintelex::cache_control<syclintelex::cache_mode::cached,
                                               syclex::cache_level::L2>>}};
            joint_matrix_load(
-               sg, sub_c, C_ptr + (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+               sg, sub_c, C_ptr + (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(sg, sub_a, A_ptr + (sg_startx * TM) * K + k * TK,
                                K);
              if constexpr (vnniFactor == 0) {
                joint_matrix_load(
-                   sg, sub_b, B_ptr + (k * TK) * N + sg_starty / SG_SZ * TN, N);
+                   sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, N);
                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
              } else {
                joint_matrix_load(sg, sub_bp,
                                  B_ptr +
                                      (k * TK / vnniFactor) * (N * vnniFactor) +
-                                     sg_starty / SG_SZ * TN * vnniFactor,
+                                     sg_starty / sg_size * TN * vnniFactor,
                                  N * vnniFactor);
 
                joint_matrix_mad(sg, sub_c, sub_a, sub_bp, sub_c);
@@ -79,7 +85,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) {
                                               syclex::cache_level::L2>>}};
            joint_matrix_store(sg, sub_c,
                               C_w_ptr + (sg_startx * TM) * N +
-                                  sg_starty / SG_SZ * TN,
+                                  sg_starty / sg_size * TN,
                               N, layout::row_major);
          }); // parallel for
    }).wait();

From 850f30b7177f46fb7f2ad9de3915616fd67c5c35 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 08:17:56 -0700
Subject: [PATCH 10/42] Pass: joint_matrix_bfloat16_colmajorA_colmajorB

---
 ...t_matrix_bfloat16_colmajorA_colmajorB_impl.hpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp
index f7a78a1c8da87..6771795c70a0b 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp
@@ -19,15 +19,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item) 
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -48,7 +51,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -59,14 +62,14 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (sg_starty / SG_SZ * TN) * K + k * TK,
+                     (sg_starty / sg_size * TN) * K + k * TK,
                  K);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();

From e7fcb5c6cf36e59447343b7ac833adffac62c45a Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 08:21:23 -0700
Subject: [PATCH 11/42] Pass: joint_matrix_int8_colmajorA_colmajorB

---
 .../joint_matrix_int8_colmajorA_colmajorB_impl.hpp  | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp
index 5123003769465..1390f8225406c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp
@@ -25,15 +25,18 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -60,14 +63,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (sg_starty / SG_SZ * TN) * K + k * TK,
+                     (sg_starty / sg_size * TN) * K + k * TK,
                  K);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();

From 596aaffaf0e5dec1c79e72bf3da4099c1a6238ee Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 08:27:49 -0700
Subject: [PATCH 12/42] Pass: joint_matrix_prefetch

---
 .../Matrix/joint_matrix_prefetch_impl.hpp     | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
index e4111526b4696..a2c5864fc0f14 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
@@ -11,10 +11,13 @@
 #define TM 8
 #define TK 16
 
+template <unsigned int vnniFactor>
+class mult;
+
 template <typename T1, typename T2, typename T, size_t M, size_t N, size_t K,
           layout B_layout, unsigned int vnniFactor>
 void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty,
-                            T1 *A, T2 *B, T *C) {
+                            size_t sg_size, T1 *A, T2 *B, T *C) {
   auto pA = address_space_cast<sycl::access::address_space::global_space,
                                access::decorated::no>(A);
   auto pB = address_space_cast<sycl::access::address_space::global_space,
@@ -29,22 +32,22 @@ void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty,
                                 layout::row_major,
                                 syclex::properties{syclex::prefetch_hint_L1});
   joint_matrix_prefetch<TK / vnniFactor, TN * vnniFactor>(
-      sg, B + sg_starty / SG_SZ * TN * vnniFactor, N * vnniFactor, B_layout,
+      sg, B + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor, B_layout,
 
       syclex::properties{syclex::prefetch_hint_L1});
   joint_matrix_prefetch<TM, TN>(
-      sg, C + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N,
+      sg, C + (sg_startx * TM) * N + sg_starty / sg_size * TN, N,
       layout::row_major, syclex::properties{syclex::prefetch_hint_L1});
   joint_matrix_fill(sg, sub_c, 1);
   for (int k = 0; k < K; k += TK) {
     joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K);
     joint_matrix_load(sg, sub_b,
-                      pB + k * N + sg_starty / SG_SZ * TN * vnniFactor,
+                      pB + k * N + sg_starty / sg_size * TN * vnniFactor,
                       N * vnniFactor);
     joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
   }
   joint_matrix_store(sg, sub_c,
-                     pC + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N,
+                     pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N,
                      layout::row_major);
 }
 
@@ -54,11 +57,14 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) {
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
 
+  size_t sg_size = get_sg_size<mult<vnniFactor>>(q);
   q.submit([&](handler &cgh) {
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+     cgh.parallel_for<mult<vnniFactor>>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item) 
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
@@ -67,7 +73,7 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) {
 
            sub_group sg = spmd_item.get_sub_group();
            joint_matrix_gemm_vnni<T1, T2, T, M, N, K, B_layout, vnniFactor>(
-               sg, sg_startx, sg_starty, A, B, C);
+               sg, sg_startx, sg_starty, sg_size, A, B, C);
          }); // parallel for
    }).wait();
 }

From 3563d69469e20fc9c2e6a730bbf70f2d28ed534b Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 08:44:00 -0700
Subject: [PATCH 13/42] Fixed sycl merge and joint_matrix_prefetch

---
 sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
index 7b6e5b9a861ca..56c8af2325ac1 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
@@ -11,7 +11,7 @@
 #define TM 8
 #define TK 16
 
-template <unsigned int vnniFactor>
+template <layout B_layout, layout C_layout, unsigned int vnniFactor>
 class mult;
 
 template <typename T1, typename T2, typename T, size_t M, size_t N, size_t K,
@@ -69,9 +69,9 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) {
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
 
-  size_t sg_size = get_sg_size<mult<vnniFactor>>(q);
+  size_t sg_size = get_sg_size<mult<B_layout, C_layout, vnniFactor>>(q);
   q.submit([&](handler &cgh) {
-     cgh.parallel_for<mult<vnniFactor>>(
+     cgh.parallel_for<mult<B_layout, C_layout, vnniFactor>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item) 
 #ifdef SG_SZ

From c1bca5e1798362404bd55d36f7c6f5b1bcfd52bb Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 09:08:21 -0700
Subject: [PATCH 14/42] Fixed CPU tests

---
 sycl/test-e2e/Matrix/get_coord_int8_matB.cpp  |  1 -
 .../Matrix/get_coord_int8_matB_impl.hpp       | 95 ++++++++++---------
 .../joint_matrix_colA_rowB_colC_impl.hpp      | 17 ++--
 .../Matrix/joint_matrix_out_bounds_impl.hpp   | 24 +++--
 4 files changed, 73 insertions(+), 64 deletions(-)

diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index ad064fd82fc0a..feac65bf0e4bf 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -9,7 +9,6 @@
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-// XFAIL: cpu
 
 #include "common.hpp"
 
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
index d0f77a288d938..480f01ca77ceb 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
@@ -94,55 +94,65 @@ wi [1,0] -->    i=0, [8, 0]
 // clang-format on
 
 template <typename T, size_t K, size_t N>
-void matrix_sum_cols(queue q, big_matrix<T, K, N> &B,
-                     big_matrix<T, K / VF, N * VF> &Bvnni, nd_range<2> &r) {
+void matrix_sum_cols(big_matrix<T, K, N> &B,
+                     big_matrix<T, K / VF, N * VF> &Bvnni) {
   buffer<int8_t, 2> bufB(B.get_data(), range<2>(K, N));
   buffer<int8_t, 2> bufBvnni(Bvnni.get_data(), range<2>(K / VF, N * VF));
 
   int sum_cols[N] = {0};
   buffer<int> sum_cols_v(sum_cols, N);
 
+  size_t NDRangeK = K / TK;
+  size_t NDRangeN = N / TN;
+  queue q;
+  size_t sg_size = get_sg_size<class sum>(q);
+  nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size});
+
   q.submit([&](handler &cgh) {
      auto accB = bufBvnni.get_access<access::mode::read_write>(cgh);
      auto v = sum_cols_v.get_access<access::mode::atomic>(cgh);
 
-     cgh.parallel_for(r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(
-                             SG_SZ)]] {
-       const auto global_idx = spmd_item.get_global_id(0);
-       const auto global_idy = spmd_item.get_global_id(1);
-       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-       const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-       sycl::sub_group sg = spmd_item.get_sub_group();
-
-       joint_matrix<sub_group, int8_t, use::b, TK, TN, layout::ext_intel_packed>
-           sub_b;
-
-       joint_matrix_load(sg, sub_b,
-                         accB.template get_multi_ptr<access::decorated::no>() +
-                             (sg_startx * (TK / VF) * N * VF) +
-                             sg_starty / SG_SZ * TN * VF,
-                         N * VF);
-
-       int32_t sum_local_cols[N] = {0};
-       ext::intel::experimental::matrix::joint_matrix_apply(
-           sg, sub_b, [&](int8_t &x, size_t row, size_t col) {
-             // the coordinates returned are in the logical range [K,N]
-             // If users want to retrieve the VNNIed coordinates, they can be
-             // obtained using
-             // colVNNI = col/VF
-             // rowVNNI = row*VF
-             size_t global_index = col + global_idy / SG_SZ * TN;
-             sum_local_cols[global_index] += x;
-           });
-
-       for (int i = 0; i < N; i++) {
-         sum_local_cols[i] =
-             reduce_over_group(sg, sum_local_cols[i], sycl::plus<>());
-         if (global_idy % SG_SZ == 0)
-           atomic_fetch_add(v[i], sum_local_cols[i]);
-       }
-     }); // parallel for
+     cgh.parallel_for<class sum>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sycl::sub_group sg = spmd_item.get_sub_group();
+
+           joint_matrix<sub_group, int8_t, use::b, TK, TN,
+                        layout::ext_intel_packed>
+               sub_b;
+
+           joint_matrix_load(
+               sg, sub_b,
+               accB.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * (TK / VF) * N * VF) +
+                   sg_starty / sg_size * TN * VF,
+               N * VF);
+
+           int32_t sum_local_cols[N] = {0};
+           ext::intel::experimental::matrix::joint_matrix_apply(
+               sg, sub_b, [&](int8_t &x, size_t row, size_t col) {
+                 // the coordinates returned are in the logical range [K,N]
+                 // If users want to retrieve the VNNIed coordinates, they can
+                 // be obtained using colVNNI = col/VF rowVNNI = row*VF
+                 size_t global_index = col + global_idy / sg_size * TN;
+                 sum_local_cols[global_index] += x;
+               });
+
+           for (int i = 0; i < N; i++) {
+             sum_local_cols[i] =
+                 reduce_over_group(sg, sum_local_cols[i], sycl::plus<>());
+             if (global_idy % sg_size == 0)
+               atomic_fetch_add(v[i], sum_local_cols[i]);
+           }
+         }); // parallel for
    }).wait();
   sum_cols_ref<T, K, N>(bufB.get_host_access(), sum_cols_v.get_host_access());
 }
@@ -158,11 +168,6 @@ int main() {
   int8_t Bvnni[MATRIX_K / VF][MATRIX_N * VF];
   big_matrix<int8_t, MATRIX_K / VF, MATRIX_N * VF> MBvnni((int8_t *)&Bvnni);
 
-  size_t NDRangeK = MATRIX_K / TK;
-  size_t NDRangeN = MATRIX_N / TN;
-  queue q;
-  nd_range<2> r({NDRangeK, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});
-
   for (int i = 0; i < MATRIX_K; i++) {
     for (int j = 0; j < MATRIX_N; j++) {
       B[i][j] = i + j;
@@ -170,7 +175,7 @@ int main() {
   }
   matrix_vnni<int8_t>(MATRIX_K, MATRIX_N, *B, *Bvnni, VF);
   // This test calculates sum of columns in the non VNNI B matrix
-  matrix_sum_cols<int8_t, MATRIX_K, MATRIX_N>(q, MB, MBvnni, r);
+  matrix_sum_cols<int8_t, MATRIX_K, MATRIX_N>(MB, MBvnni);
   std::cout << "Passed\n";
   return 0;
 }
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp
index 3347f9e7cc39e..ae0620f3c5459 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp
@@ -10,8 +10,6 @@
 #include <random>
 #include <sycl/usm.hpp>
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TM = 8;
 constexpr size_t TK = 16;
@@ -27,12 +25,15 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B);
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
+  size_t sg_size = get_sg_size<class mult>(q);
 
   q.submit([&](handler &cgh) {
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+     cgh.parallel_for<class mult>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item) 
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            auto pA =
                address_space_cast<sycl::access::address_space::global_space,
@@ -61,12 +62,12 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
            joint_matrix_fill(sg, sub_c, 1);
            for (int k = 0; k < K; k += TK) {
              joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K);
-             joint_matrix_load(sg, sub_b, pB + k * N + sg_starty / SG_SZ * TN,
+             joint_matrix_load(sg, sub_b, pB + k * N + sg_starty / sg_size * TN,
                                N);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
-               sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N,
+               sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N,
                layout::col_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp
index bda9f70b296d3..d186fdcad03a4 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp
@@ -9,12 +9,12 @@
 #include <iostream>
 #include <sycl/usm.hpp>
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TM = 8;
 constexpr size_t TK = 16;
 
+template <layout B_layout, unsigned int vnniFactor>
+class mult;
+
 template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
           size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
           size_t NUM_COLS_C, layout B_layout, unsigned int vnniFactor>
@@ -27,11 +27,15 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   // Add one iteration for the out of bounds dpas instruction
   size_t NDRangeM = M / TM + (((M % TM) != 0) ? 1 : 0);
   size_t NDRangeN = N / TN;
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
+  size_t sg_size = get_sg_size<mult<B_layout, vnniFactor>>(q);
 
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<mult<B_layout, vnniFactor>>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item) 
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            auto pA =
                address_space_cast<sycl::access::address_space::global_space,
@@ -60,7 +64,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
            joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
            // bounds-checked load where width and height are added
            ext::intel::experimental::matrix::joint_matrix_fill_checked(
-               sg, sub_c, 1, N, M, N, sg_startx * TM, sg_starty / SG_SZ * TN);
+               sg, sub_c, 1, N, M, N, sg_startx * TM, sg_starty / sg_size * TN);
            for (int k = 0; k < K; k += TK) {
              // bounds-checked load where width and height are added
              ext::intel::experimental::matrix::joint_matrix_load_checked(
@@ -69,13 +73,13 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
              // bounds-checked load where width and height are added
              ext::intel::experimental::matrix::joint_matrix_load_checked(
                  sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor,
-                 k, sg_starty / SG_SZ * TN * vnniFactor);
+                 k, sg_starty / sg_size * TN * vnniFactor);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            // bounds-checked store where width and height are added
            ext::intel::experimental::matrix::joint_matrix_store_checked(
                sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM,
-               sg_starty / SG_SZ * TN);
+               sg_starty / sg_size * TN);
          }); // parallel for
    }).wait();
 }

From 3fe79dafd3856a3043ca347f8324817025935e32 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 09:20:04 -0700
Subject: [PATCH 15/42] clang-format

---
 .../Matrix/elemwise_irreg_size_ops_bf16.cpp       |  2 +-
 .../Matrix/joint_matrix_annotated_ptr_impl.hpp    | 13 +++++++------
 ...t_matrix_bfloat16_colmajorA_colmajorB_impl.hpp |  2 +-
 .../Matrix/joint_matrix_out_bounds_impl.hpp       |  5 ++---
 .../Matrix/joint_matrix_prefetch_impl.hpp         | 15 +++++++--------
 5 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
index 4d453fe35da5c..10c5e195a5344 100644
--- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
+++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
@@ -14,9 +14,9 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+#include "common.hpp"
 #include <iostream>
 #include <sycl/sycl.hpp>
-#include "common.hpp"
 
 // 10x12 is not multiply the sg size, slicing implementation will have to insert
 // padding
diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp
index 2eeba80572608..5463ea040d1eb 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp
@@ -11,8 +11,7 @@
 #define TM 8
 #define TK 16
 
-template <unsigned int vnniFactor>
-class mult;
+template <unsigned int vnniFactor> class mult;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           unsigned int vnniFactor>
@@ -58,15 +57,17 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) {
                syclex::properties{syclintelex::read_hint<
                    syclintelex::cache_control<syclintelex::cache_mode::cached,
                                               syclex::cache_level::L2>>}};
-           joint_matrix_load(
-               sg, sub_c, C_ptr + (sg_startx * TM) * N + sg_starty / sg_size * TN,
-               N, layout::row_major);
+           joint_matrix_load(sg, sub_c,
+                             C_ptr + (sg_startx * TM) * N +
+                                 sg_starty / sg_size * TN,
+                             N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(sg, sub_a, A_ptr + (sg_startx * TM) * K + k * TK,
                                K);
              if constexpr (vnniFactor == 0) {
                joint_matrix_load(
-                   sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, N);
+                   sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN,
+                   N);
                joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
              } else {
                joint_matrix_load(sg, sub_bp,
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp
index 6771795c70a0b..e3234da2cd5d9 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp
@@ -27,7 +27,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp
index d186fdcad03a4..4be21beea9c45 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp
@@ -12,8 +12,7 @@
 constexpr size_t TM = 8;
 constexpr size_t TK = 16;
 
-template <layout B_layout, unsigned int vnniFactor>
-class mult;
+template <layout B_layout, unsigned int vnniFactor> class mult;
 
 template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
           size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
@@ -32,7 +31,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   q.submit([&](handler &cgh) {
      cgh.parallel_for<mult<B_layout, vnniFactor>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
index 56c8af2325ac1..9d9c99bf4ae1a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp
@@ -11,8 +11,7 @@
 #define TM 8
 #define TK 16
 
-template <layout B_layout, layout C_layout, unsigned int vnniFactor>
-class mult;
+template <layout B_layout, layout C_layout, unsigned int vnniFactor> class mult;
 
 template <typename T1, typename T2, typename T, size_t M, size_t N, size_t K,
           layout B_layout, layout C_layout, unsigned int vnniFactor>
@@ -43,7 +42,7 @@ void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty,
     joint_matrix_prefetch<TM, TN>(
         sg, C + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, C_layout,
         syclex::properties{syclex::prefetch_hint_L1});
-            
+
   joint_matrix_fill(sg, sub_c, 1);
   for (int k = 0; k < K; k += TK) {
     joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K);
@@ -55,8 +54,8 @@ void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty,
 
   if constexpr (C_layout == layout::col_major)
     joint_matrix_store(sg, sub_c,
-                       pC + (sg_starty / sg_size * TN) * M + (sg_startx * TM), M,
-                       C_layout);
+                       pC + (sg_starty / sg_size * TN) * M + (sg_startx * TM),
+                       M, C_layout);
   else
     joint_matrix_store(sg, sub_c,
                        pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N,
@@ -73,7 +72,7 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) {
   q.submit([&](handler &cgh) {
      cgh.parallel_for<mult<B_layout, C_layout, vnniFactor>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
@@ -85,8 +84,8 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) {
 
            sub_group sg = spmd_item.get_sub_group();
            joint_matrix_gemm_vnni<T1, T2, T, M, N, K, B_layout, C_layout,
-                                  vnniFactor>(sg, sg_startx, sg_starty, sg_size, A, B,
-                                              C);
+                                  vnniFactor>(sg, sg_startx, sg_starty, sg_size,
+                                              A, B, C);
          }); // parallel for
    }).wait();
 }

From 8dc3756d31095fc476d7dcb2a56e9e45caafdf35 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 09:44:00 -0700
Subject: [PATCH 16/42] clang-format

---
 sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp
index ae0620f3c5459..373ec652cc063 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp
@@ -10,7 +10,6 @@
 #include <random>
 #include <sycl/usm.hpp>
 
-
 constexpr size_t TM = 8;
 constexpr size_t TK = 16;
 
@@ -30,7 +29,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
   q.submit([&](handler &cgh) {
      cgh.parallel_for<class mult>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
@@ -67,8 +66,8 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) {
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
-               sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N,
-               layout::col_major);
+               sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::col_major);
          }); // parallel for
    }).wait();
 }

From 516f6484c0d1409154889179d603575d02bdbd9f Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 12:45:24 -0700
Subject: [PATCH 17/42] Undo changes

---
 .../Matrix/elemwise_irreg_size_ops_bf16.cpp   | 19 ++++++++++---------
 sycl/test-e2e/Matrix/get_coord_int8_matB.cpp  |  1 +
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
index 10c5e195a5344..6a4632305efb3 100644
--- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
+++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
@@ -16,7 +16,10 @@
 
 #include "common.hpp"
 #include <iostream>
-#include <sycl/sycl.hpp>
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/matrix/matrix.hpp>
+
+#define SG_SZ 16
 
 // 10x12 is not multiply the sg size, slicing implementation will have to insert
 // padding
@@ -42,18 +45,16 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
-  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item)
-#ifdef SG_SZ
+         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
+         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
              [[intel::reqd_sub_group_size(SG_SZ)]]
-#endif
+
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -74,7 +75,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
                N, layout::row_major);
            for (int k = 0; k < K; k += TK) {
              joint_matrix_load(
@@ -86,7 +87,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k) * (N) + sg_starty / sg_size * TN * 2,
+                     (k) * (N) + sg_starty / SG_SZ * TN * 2,
                  N * 2);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
@@ -94,7 +95,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index feac65bf0e4bf..ad064fd82fc0a 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -9,6 +9,7 @@
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
+// XFAIL: cpu
 
 #include "common.hpp"
 

From b48d61d81527d393e0218cf911a6e429135be437 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 14 May 2024 14:13:38 -0700
Subject: [PATCH 18/42] SG32 #define SG_SZ

---
 sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp                  | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp  | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp            | 2 +-
 sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp               | 2 +-
 sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp               | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp            | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp           | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp    | 2 +-
 .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp         | 2 +-
 .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp       | 2 +-
 .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp  | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp             | 2 +-
 .../Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp   | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp       | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp         | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp                 | 2 +-
 .../Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp       | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp            | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp           | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp                 | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp           | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp          | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp              | 2 +-
 sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp              | 2 +-
 30 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
index 182ec8e81233d..4833404610369 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
index 7b9655fe62416..3916aaff03867 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
index e88f0a0a135f5..ddfa39c541c0a 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
index 8a91d404f6948..ad644c8734475 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
index 06c1f5d3f5c96..06d459a2a3ce5 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
index 4824ff2568d30..4624110577ea2 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp
@@ -13,6 +13,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
index 3bdd2ed83b08d..9d38fb7afa30d 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp
@@ -17,7 +17,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
index 79383fce4b7fc..13d8df56f40a1 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp
@@ -17,7 +17,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
index 78b95b3ee53d6..5b77ec89fd997 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
@@ -18,7 +18,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
index b9660e73e3ab2..46de02fe8f525 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp
@@ -17,7 +17,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 // Sub-matrix N dimension
 static constexpr size_t SN = 16;
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
index 0ce3d22bc873b..c38d8f133264d 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_apply_bf16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
index 7040058dc8554..b93985f8e594e 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
@@ -19,7 +19,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
index 3a023df7b10f8..10391f2e7e319 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -16,7 +16,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 1b7a8ed351139..994a2217d681f 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -21,7 +21,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index 8c52421657229..4f7e3638daaf3 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -18,7 +18,7 @@
 #include "../common.hpp"
 #include <cstddef>
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
index fc7d0c9e4eba2..2ea58e9953917 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp
@@ -20,7 +20,7 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 06798015261e7..6532bcfe47bff 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -25,7 +25,7 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
index e2158368ff6f8..70e53441cb48f 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_colA_rowB_colC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
index 52d8bc9c6f4a4..b474f846d11d5 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp
@@ -13,6 +13,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 
 #include "../joint_matrix_down_convert_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
index cb7b15819f2bb..f4dd217655439 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp
@@ -18,7 +18,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
index d7289579098e9..c89c657c77fbc 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -21,7 +21,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp
index 09c4d6059750c..c8ee58e126732 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp
@@ -18,7 +18,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_int8_vnni_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
index ed7fb96ca104a..1848a480a0eb7 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 constexpr size_t MATRIX_K = 1024 + 24;
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp
index 6b059ed357781..b193d422c2b8c 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_ss_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp
index 5a13d4c1f1807..cfd89fcb8a1bf 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_su_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp
index 9a82aa8bb647a..18da250bc808d 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
index 504e7beac85e3..214dd10f5158f 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
@@ -13,7 +13,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
index 3532e5cc4e3ba..f4b2426af93a8 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp
@@ -15,7 +15,7 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 static constexpr size_t MATRIX_K = 1024 + 14;
 
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp
index a4292269811f1..aec91f70bd1d7 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_us_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp
index 842977311cafa..b2d6510622736 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp
@@ -16,7 +16,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 32;
+#define SG_SZ 32
 constexpr size_t TN = 16;
 
 #include "../joint_matrix_uu_int8_impl.hpp"

From c23311c62be3f852f71f18798fef79d3f4226699 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 14 May 2024 14:19:53 -0700
Subject: [PATCH 19/42] XMX8 no SG_SZ

---
 sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp                   | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp          | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp          | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp   | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp             | 1 -
 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp    | 1 -
 sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp                | 1 -
 sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp                | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp             | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp            | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp     | 1 -
 .../test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp | 1 -
 .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp        | 1 -
 .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp   | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp              | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp        | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp        | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp        | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp                  | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp             | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp    | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp            | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp            | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp           | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp               | 1 -
 sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp               | 1 -
 29 files changed, 29 deletions(-)

diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
index aa2d2e28ac468..d7df42000249a 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
index f360bdbba6ada..826b99dfcf306 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
index 6f3aedfe506d5..a39cb6664d100 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
index ca425f7ded5d1..9ff39c8d516d0 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
index b9d49bba70abb..5bae6a3184808 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
@@ -13,7 +13,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
index 2975ab9edf6c4..87adf891cd16b 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
@@ -16,7 +16,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp
index 5aa1cd8a2a0d7..d86af51e3cd86 100644
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp
@@ -18,7 +18,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp
index ece88423d0f43..e815b46e1ed21 100644
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp
@@ -18,7 +18,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
index a84580c3f846c..4c4d6c6eb5765 100644
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp
index be1ac0f24e88c..32b8c3bc6e24f 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t SN = 8;
 
 #include "../joint_matrix_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp
index f02028d31e7ed..614a67db9ff8a 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_apply_bf16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
index b52e8085be172..fbcd21be62f75 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
@@ -15,7 +15,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
index 2e05e656e5379..c5e399bc98f48 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -15,7 +15,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 18238e4896ccb..ba24ea0dfc4b8 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -17,7 +17,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index 49b5e6eebb4ac..9d88c89c50f41 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -17,7 +17,6 @@
 #include "../common.hpp"
 #include <cstddef>
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp
index 008db77761e3d..173ac16a42afc 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
index b72e2ed83841c..5a41f19bc2ac1 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_bfloat16_32x64_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
index e6371806f3592..09c1a4ae32a92 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 static constexpr int TN = 8;
 
 #include "../joint_matrix_bfloat16_array_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
index 494a84c173edb..7d74bf8055d6b 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_colA_rowB_colC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp
index dbe060711b02a..419cc936f14e4 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp
index 728a057aedaa7..3dadaeebee511 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp
@@ -12,7 +12,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_int8_vnni_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
index 532af4dc5d844..07a48bd44fccd 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
@@ -8,7 +8,6 @@
 
 #include "../common.hpp"
 
-#define SG_SZ 8
 constexpr size_t SN = 8;
 
 #include "../joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
index 944cccd310d3e..0ba69032465b9 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 static constexpr size_t MATRIX_K = 1024 + 24;
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp
index 4a3770be74f91..fbd97d215498d 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_ss_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp
index d5c7a74c20aff..2694d0135c6a1 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_su_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
index 672e8b87e22e6..a0a98e3f16d0c 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
@@ -13,7 +13,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
index aa8e00c08b658..f42f37378514d 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
@@ -14,7 +14,6 @@
 
 #include "../common.hpp"
 
-constexpr size_t SG_SZ = 8;
 constexpr size_t TN = 8;
 constexpr size_t MATRIX_K = 1024 + 14;
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp
index 56feaaec924ad..0c5f46f6fcec6 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_us_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp
index a1643332e489f..bc08632463f22 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 8
 constexpr size_t TN = 8;
 
 #include "../joint_matrix_uu_int8_impl.hpp"

From db8cd7ee26e1abc33bbb0b52623e669caeffc709 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 15 May 2024 07:39:06 -0700
Subject: [PATCH 20/42] WIP abc_impl: remove SG_SZ

---
 sycl/test-e2e/Matrix/element_wise_abc.cpp        |  1 -
 sycl/test-e2e/Matrix/element_wise_abc_impl.hpp   | 16 ++++++++++++----
 .../Matrix/element_wise_all_ops_half.cpp         |  1 -
 .../Matrix/element_wise_all_ops_int8.cpp         |  1 -
 .../Matrix/element_wise_all_ops_int8_packed.cpp  |  1 -
 .../Matrix/element_wise_all_ops_tf32.cpp         |  1 -
 sycl/test-e2e/Matrix/element_wise_all_sizes.cpp  |  3 ---
 .../Matrix/elemwise_irreg_size_ops_bf16.cpp      |  2 --
 sycl/test-e2e/Matrix/get_coord_float_matC.cpp    |  1 -
 sycl/test-e2e/Matrix/get_coord_int8_matA.cpp     |  1 -
 sycl/test-e2e/Matrix/get_coord_int8_matB.cpp     |  1 -
 sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp  |  1 -
 .../Matrix/joint_matrix_annotated_ptr.cpp        |  1 -
 sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp |  1 -
 .../Matrix/joint_matrix_bf16_fill_k_cache.cpp    |  1 -
 .../joint_matrix_bf16_fill_k_cache_init.cpp      |  2 +-
 .../joint_matrix_bf16_fill_k_cache_unroll.cpp    |  1 -
 ...oint_matrix_bf16_fill_k_cache_unroll_init.cpp |  1 -
 sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp   |  1 -
 .../Matrix/joint_matrix_bfloat16_array.cpp       |  2 +-
 ...joint_matrix_bfloat16_colmajorA_colmajorB.cpp |  1 -
 .../Matrix/joint_matrix_bfloat16_packedB.cpp     |  2 --
 .../Matrix/joint_matrix_colA_rowB_colC.cpp       |  1 -
 .../Matrix/joint_matrix_down_convert.cpp         |  3 ---
 sycl/test-e2e/Matrix/joint_matrix_half.cpp       |  1 -
 .../joint_matrix_int8_colmajorA_colmajorB.cpp    |  1 -
 sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp  |  1 -
 .../Matrix/joint_matrix_opt_kernel_feature.cpp   |  1 -
 sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp |  1 -
 sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp   |  1 -
 .../Matrix/joint_matrix_rowmajorA_rowmajorB.cpp  |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp    |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp    |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_tf32.cpp       |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp |  1 -
 .../test-e2e/Matrix/joint_matrix_unaligned_k.cpp |  1 -
 sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp    |  4 ----
 sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp    |  4 ----
 38 files changed, 14 insertions(+), 65 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/element_wise_abc.cpp
index c9954fee4f898..0a6a4e4abaa03 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index bf8b2ecb4df85..8c08bfad7a867 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -15,6 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define TM 8
 #define TK 32
+class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
@@ -27,14 +28,21 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  std::cout << "Artem: before get_sg_size()\n";
+  size_t sg_size = get_sg_size<add>(q);
+  std::cout << "Artem: after get_sg_size()\n";
   q.submit([&](handler &cgh) {
      accessor accC{bufC, cgh};
      accessor accA{bufA, cgh};
      accessor accB{bufB, cgh};
 
      cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -60,14 +68,14 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_b,
                accB.template get_multi_ptr<access::decorated::no>() +
-                   sg_starty / SG_SZ * TN * vnniFactor,
+                   sg_starty / sg_size * TN * vnniFactor,
                N * vnniFactor);
            joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; });
 
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; });
          }); // parallel for
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
index fae692ff39ed9..c07d19ed73f2e 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
index 93ddcefc19ac3..e1a2cf4eecfa1 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
index 2d79d945e8980..24f82f47e8fcd 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp
index 28483b5c2092e..6e2f8dcff6384 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp
@@ -13,7 +13,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "element_wise_all_ops_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index 661027237f836..1c07e494fcc47 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -12,7 +12,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-#define SG_SZ 16
-
 #include "element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
index 5d923f4fb908d..7abcf7f69ab4b 100644
--- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
+++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp
@@ -22,8 +22,6 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using bfloat16 = sycl::ext::oneapi::bfloat16;
 
-#define SG_SZ 16
-
 // 10x12 is not multiply the sg size, slicing implementation will have to insert
 // padding
 #define TM 10
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
index 78a6f815df19c..57c9a00d98fd4 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
@@ -16,7 +16,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "get_coord_float_matC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
index 6500a34f48119..67fa811f2d764 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
@@ -16,7 +16,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "get_coord_int8_matA_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index 9fda659505c43..782df68d21ff5 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
index 408a6087206ea..0eb13cf57347c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 // Sub-matrix N dimension
 static constexpr size_t SN = 16;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp
index 265532c140e12..7aad02b2066a1 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_annotated_ptr_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp
index 82ad15285a4fa..d58677fa2c178 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_apply_bf16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
index 0c93876db2a15..abee7d7259f28 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
@@ -18,7 +18,6 @@
 #include "common.hpp"
 #include <cstddef>
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
index 7206cb165349b..d839f3db8f481 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -14,7 +14,7 @@
 
 #include "common.hpp"
 #include <cstddef>
-#define SG_SZ 16
+
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 5518d9cb08fbc..1800901e24111 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -20,7 +20,6 @@
 #include "common.hpp"
 #include <cstddef>
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index a393f3a2ad729..701c17741f576 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -17,7 +17,6 @@
 #include "common.hpp"
 #include <cstddef>
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index d1410ac68276e..2222cbb605a15 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
index 80e1f310ce440..98ed155b297ad 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
@@ -11,7 +11,7 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#define SG_SZ 16
+
 static constexpr int TN = 16;
 
 #include "joint_matrix_bfloat16_array_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 9cd31a8c5178e..19d12915b4a95 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -20,7 +20,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
index 3e80168752545..0d592e04b606c 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp
@@ -12,6 +12,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-#define SG_SZ 16
 #include "joint_matrix_bfloat16_packedB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
index 7d114175dff13..354a71006e129 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "joint_matrix_colA_rowB_colC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp
index caea640677aa7..dee504c22e7f6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp
@@ -11,7 +11,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t SG_SZ = 16;
-
 #include "joint_matrix_down_convert_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
index ac09361a0799c..9281e47f572d2 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
@@ -17,7 +17,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index 33c00022a5a76..fb29cc2baaf74 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -20,7 +20,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
index 02813c6720deb..8dcddb841721d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
@@ -15,7 +15,6 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_vnni_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
index 6195ee2935892..031c7753de425 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
@@ -16,7 +16,6 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 static constexpr size_t SN = 16;
 
 #include "joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
index a5302b9ee7a57..f3485408373b9 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp
@@ -16,7 +16,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 constexpr size_t MATRIX_K = 1024 + 24;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp
index 30d9278e07157..7abea83c6d287 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp
@@ -13,6 +13,5 @@
 
 #include "common.hpp"
 
-#define SG_SZ 16
 constexpr size_t TN = 16;
 #include "joint_matrix_prefetch_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
index 958bd94fe0cd3..77df6085bc09a 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp
@@ -16,8 +16,4 @@
 // transform. This is currently only available on AMX and XMX of PVC
 
 #include "common.hpp"
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 #include "joint_matrix_rowmajorA_rowmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp
index e487b8cdcb41d..2089e0185b0e0 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_ss_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp
index 72910c4ed5446..7a02d03b9d642 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_su_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp
index 6f34a4acbea61..922b79f356e78 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp
@@ -13,10 +13,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_tf32_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
index f98c8bd3c7b48..bd04b157cf667 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
@@ -12,7 +12,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 
 #include "joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
index 212ac34a3a640..e1cf6cb6cf8bb 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp
@@ -14,7 +14,6 @@
 
 #include "common.hpp"
 
-constexpr size_t SG_SZ = 16;
 constexpr size_t TN = 16;
 static constexpr size_t MATRIX_K = 1024 + 14;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp
index 409b589904847..f4237b995aad8 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_us_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp
index 59a47484a335c..a75d18b9e6967 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp
@@ -12,10 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define SG_SZ 16
 constexpr size_t TN = 16;
 
 #include "joint_matrix_uu_int8_impl.hpp"

From 5877ed2b40f1082a656eb37253f4852cebb302f8 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 15 May 2024 15:21:26 -0700
Subject: [PATCH 21/42] Made tests independent of SG_SZ

---
 .../XMX8/joint_matrix_opt_kernel_feature.cpp  |  3 -
 .../test-e2e/Matrix/element_wise_abc_impl.hpp |  8 +-
 .../Matrix/element_wise_all_sizes_impl.hpp    | 54 ++++++-----
 .../Matrix/get_coord_float_matC_impl.hpp      | 15 ++--
 .../Matrix/get_coord_int8_matA_impl.hpp       | 73 ++++++++-------
 .../Matrix/joint_matrix_all_sizes_impl.hpp    | 57 ++++++------
 .../Matrix/joint_matrix_apply_bf16_impl.hpp   | 57 ++++++------
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   | 17 ++--
 .../joint_matrix_bfloat16_array_impl.hpp      | 15 ++--
 .../Matrix/joint_matrix_bfloat16_impl.hpp     | 15 ++--
 .../joint_matrix_bfloat16_packedB_impl.hpp    | 15 ++--
 .../Matrix/joint_matrix_down_convert_impl.hpp | 15 ++--
 .../Matrix/joint_matrix_half_impl.hpp         | 90 ++++++++++---------
 .../Matrix/joint_matrix_int8_vnni_impl.hpp    | 14 +--
 .../joint_matrix_opt_kernel_feature.cpp       |  3 -
 .../joint_matrix_opt_kernel_feature_impl.hpp  | 18 ++--
 .../Matrix/joint_matrix_ss_int8_impl.hpp      | 14 +--
 .../Matrix/joint_matrix_su_int8_impl.hpp      | 16 ++--
 .../Matrix/joint_matrix_tf32_impl.hpp         | 15 ++--
 .../Matrix/joint_matrix_transposeC_impl.hpp   | 89 +++++++++---------
 .../Matrix/joint_matrix_us_int8_impl.hpp      | 14 +--
 .../Matrix/joint_matrix_uu_int8_impl.hpp      | 16 ++--
 22 files changed, 354 insertions(+), 279 deletions(-)

diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
index 07a48bd44fccd..30b3522ad2442 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
@@ -7,7 +7,4 @@
 // incompatible on the current device
 
 #include "../common.hpp"
-
-constexpr size_t SN = 8;
-
 #include "../joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index 8c08bfad7a867..dea0cf882eaaf 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -15,7 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define TM 8
 #define TK 32
-class add;
+// class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
@@ -28,15 +28,13 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
-  std::cout << "Artem: before get_sg_size()\n";
-  size_t sg_size = get_sg_size<add>(q);
-  std::cout << "Artem: after get_sg_size()\n";
+  size_t sg_size = get_sg_size<class add>(q);
   q.submit([&](handler &cgh) {
      accessor accC{bufC, cgh};
      accessor accA{bufA, cgh};
      accessor accB{bufB, cgh};
 
-     cgh.parallel_for(
+     cgh.parallel_for<class add>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
index 4020e8b84bbd2..5800ab9c62745 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
@@ -23,7 +23,7 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read_write> C,
     }
 }
 
-template <typename T, typename T1, size_t TM, size_t TK>
+template <typename T, typename T1, size_t TM, size_t TK, typename kernel_name>
 void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   static constexpr size_t M = TM * M_MULTIPLIER;
   static constexpr size_t K = 128;
@@ -32,7 +32,8 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   size_t NDRangeM = M / TM;
   size_t NDRangeK = K / TK;
   queue q;
-  nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ});
+  size_t sg_size = get_sg_size<kernel_name>(q);
+  nd_range<2> r({NDRangeM, NDRangeK * sg_size}, {1, 1 * sg_size});
   big_matrix<T, M, K> A((T *)&MatA);
 
   buffer<T, 2> bufA(A.get_data(), range<2>(M, K));
@@ -40,8 +41,12 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   q.submit([&](handler &cgh) {
      sycl::accessor accA{bufA, cgh, sycl::read_write};
 
-     cgh.parallel_for(
-         r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<kernel_name>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -57,41 +62,42 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * K + sg_starty / SG_SZ * TK,
+                   (sg_startx * TM) * K + sg_starty / sg_size * TK,
                K);
          }); // parallel for
    }).wait();
   assert_ops_ref<T, M, K>(bufA.get_host_access(), result);
 }
 
-template <typename Ta, size_t tM, size_t tK> void add_ref() {
+template <typename Ta, size_t tM, size_t tK, typename kernel_name>
+void add_ref() {
   if constexpr (std::is_same_v<Ta, bfloat16>) {
     // Tests whether 5 + 2 = 7 operation is successful.
-    matrix_verify_add<bfloat16, bfloat16, tM, tK>(bfloat16(5.0), bfloat16(2.0),
-                                                  bfloat16(7.0));
+    matrix_verify_add<bfloat16, bfloat16, tM, tK, kernel_name>(
+        bfloat16(5.0), bfloat16(2.0), bfloat16(7.0));
   }
   if constexpr (std::is_same_v<Ta, int8_t>) {
-    matrix_verify_add<int8_t, int, tM, tK>(5 /*val1*/, 2 /*val2*/,
-                                           7 /*result*/);
+    matrix_verify_add<int8_t, int, tM, tK, kernel_name>(5 /*val1*/, 2 /*val2*/,
+                                                        7 /*result*/);
   }
 }
 
 int main() {
-  add_ref<bfloat16, 1 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 2 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 3 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 4 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 5 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 6 /*TM*/, 16 /*TK*/>();
-  add_ref<bfloat16, 7 /*TM*/, 16 /*TK*/>();
+  add_ref<bfloat16, 1 /*TM*/, 16 /*TK*/, class test_bfloat16_1>();
+  add_ref<bfloat16, 2 /*TM*/, 16 /*TK*/, class test_bfloat16_2>();
+  add_ref<bfloat16, 3 /*TM*/, 16 /*TK*/, class test_bfloat16_3>();
+  add_ref<bfloat16, 4 /*TM*/, 16 /*TK*/, class test_bfloat16_4>();
+  add_ref<bfloat16, 5 /*TM*/, 16 /*TK*/, class test_bfloat16_5>();
+  add_ref<bfloat16, 6 /*TM*/, 16 /*TK*/, class test_bfloat16_6>();
+  add_ref<bfloat16, 7 /*TM*/, 16 /*TK*/, class test_bfloat16_7>();
 
-  add_ref<int8_t, 1 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 2 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 3 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 4 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 5 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 6 /*TM*/, 32 /*TK*/>();
-  add_ref<int8_t, 7 /*TM*/, 32 /*TK*/>();
+  add_ref<int8_t, 1 /*TM*/, 32 /*TK*/, class test_int8_1>();
+  add_ref<int8_t, 2 /*TM*/, 32 /*TK*/, class test_int8_2>();
+  add_ref<int8_t, 3 /*TM*/, 32 /*TK*/, class test_int8_3>();
+  add_ref<int8_t, 4 /*TM*/, 32 /*TK*/, class test_int8_4>();
+  add_ref<int8_t, 5 /*TM*/, 32 /*TK*/, class test_int8_5>();
+  add_ref<int8_t, 6 /*TM*/, 32 /*TK*/, class test_int8_6>();
+  add_ref<int8_t, 7 /*TM*/, 32 /*TK*/, class test_int8_7>();
 
   std::cout << "Passed\n";
 }
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
index bedc91bdc39d4..b424a01a7c6a6 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
@@ -28,13 +28,18 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
   buffer<float> sum_rows_v(sum_rows, M);
 
   queue q;
+  size_t sg_size = get_sg_size<class add>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto v = sum_rows_v.get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for(
-         nd_range<2>({M / TM, N / TN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class add>(
+         nd_range<2>({M / TM, N / TN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+           {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -49,7 +54,7 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
 
            float sum_local_rows[M] = {0};
@@ -62,7 +67,7 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
              sum_local_rows[i] =
                  reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
              // only Groups leader perform the global reduction
-             if (global_idy % SG_SZ == 0) {
+             if (global_idy % sg_size == 0) {
                sycl::atomic_ref<float, sycl::memory_order::relaxed,
                                 sycl::memory_scope::device>
                    aref(v[i]);
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
index afda0f90a6e37..6f57ab5b4e63c 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
@@ -72,45 +72,54 @@ W0 --> 0 0 1 1 2 2 3 3 .... 7 7
 // clang-format on
 
 template <typename T, size_t M, size_t K>
-void matrix_sum_rows(queue q, big_matrix<T, M, K> &A, nd_range<2> &r) {
+void matrix_sum_rows(big_matrix<T, M, K> &A) {
   buffer<int8_t, 2> bufA(A.get_data(), range<2>(M, K));
 
   // size of vector is equal to number of rows in big matrix
   int sum_rows[M] = {0};
   buffer<int> sum_rows_v(sum_rows, M);
+  queue q;
+  size_t sg_size = get_sg_size<class add>(q);
+  nd_range<2> r({M / TM, K / TK * sg_size}, {1, 1 * sg_size});
   q.submit([&](handler &cgh) {
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto v = sum_rows_v.get_access<access::mode::atomic>(cgh);
 
-     cgh.parallel_for(r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(
-                             SG_SZ)]] {
-       const auto global_idx = spmd_item.get_global_id(0);
-       const auto global_idy = spmd_item.get_global_id(1);
-       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-       const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-       sycl::sub_group sg = spmd_item.get_sub_group();
-       joint_matrix<sub_group, int8_t, use::a, TM, TK, layout::row_major> sub_a;
-       joint_matrix_load(sg, sub_a,
-                         accA.template get_multi_ptr<access::decorated::no>() +
-                             (sg_startx * TM * K) + sg_starty / SG_SZ * TK,
-                         K);
-
-       int32_t sum_local_rows[M] = {0};
-
-       ext::intel::experimental::matrix::joint_matrix_apply(
-           sg, sub_a, [&](int8_t &x, size_t row, size_t col) {
-             sum_local_rows[row + global_idx * TM] += x;
-           });
-       for (int i = 0; i < M; ++i) {
-         sum_local_rows[i] =
-             reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
-
-         // only Groups leader performs the global reduction
-         if (global_idy % SG_SZ == 0)
-           atomic_fetch_add(v[i], sum_local_rows[i]);
-       }
-     }); // parallel for
+     cgh.parallel_for<class add>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sycl::sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, int8_t, use::a, TM, TK, layout::row_major>
+               sub_a;
+           joint_matrix_load(
+               sg, sub_a,
+               accA.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM * K) + sg_starty / sg_size * TK,
+               K);
+
+           int32_t sum_local_rows[M] = {0};
+
+           ext::intel::experimental::matrix::joint_matrix_apply(
+               sg, sub_a, [&](int8_t &x, size_t row, size_t col) {
+                 sum_local_rows[row + global_idx * TM] += x;
+               });
+           for (int i = 0; i < M; ++i) {
+             sum_local_rows[i] =
+                 reduce_over_group(sg, sum_local_rows[i], sycl::plus<>());
+
+             // only Groups leader performs the global reduction
+             if (global_idy % sg_size == 0)
+               atomic_fetch_add(v[i], sum_local_rows[i]);
+           }
+         }); // parallel for
    }).wait();
   sum_rows_ref<T, M, K>(bufA.get_host_access(), sum_rows_v.get_host_access());
 }
@@ -124,8 +133,6 @@ int main() {
 
   size_t NDRangeM = MATRIX_M / TM;
   size_t NDRangeK = MATRIX_K / TK;
-  queue q;
-  nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ});
 
   for (int i = 0; i < MATRIX_M; i++) {
     for (int j = 0; j < MATRIX_K; j++) {
@@ -133,7 +140,7 @@ int main() {
     }
   }
 
-  matrix_sum_rows<int8_t, MATRIX_M, MATRIX_K>(q, MA, r);
+  matrix_sum_rows<int8_t, MATRIX_M, MATRIX_K>(MA);
   std::cout << "Passed\n";
   return 0;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
index edfcfe1d2e979..8e9880235c2b2 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
@@ -9,7 +9,7 @@
 static constexpr size_t M_MULTIPLIER = 16;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
-          int vnniFactor, size_t TM, size_t TN, size_t TK>
+          int vnniFactor, size_t TM, size_t TN, size_t TK, typename kernel_name>
 void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                      big_matrix<T2, K / vnniFactor, N * vnniFactor> &B) {
   size_t NDRangeM = M / TM;
@@ -19,15 +19,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
      sycl::accessor accC{bufC, cgh, sycl::read_write};
      sycl::accessor accA{bufA, cgh, sycl::read_only};
      sycl::accessor accB{bufB, cgh, sycl::read_only};
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+     cgh.parallel_for<kernel_name>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -47,7 +50,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -59,21 +62,21 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
                      (k * TK / vnniFactor) * (N * vnniFactor) +
-                     sg_starty / SG_SZ * TN * vnniFactor,
+                     sg_starty / sg_size * TN * vnniFactor,
                  N * vnniFactor);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
 }
 
 template <typename Ta, typename Tc, int vnni_factor, size_t tM, size_t tN,
-          size_t tK>
+          size_t tK, typename kernel_name>
 int init_and_multiply() {
   static constexpr size_t MATRIX_M = tM * M_MULTIPLIER;
   static constexpr size_t MATRIX_N = 128;
@@ -100,7 +103,7 @@ int init_and_multiply() {
       (Ta *)&Bvnni);
 
   matrix_multiply<Tc, Ta, MATRIX_M, MATRIX_N, MATRIX_K, vnni_factor, tM, tN,
-                  tK>(MC, MA, MBvnni);
+                  tK, kernel_name>(MC, MA, MBvnni);
   matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K);
 
   bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D);
@@ -110,23 +113,23 @@ int init_and_multiply() {
 
 int main() {
   int errors = 0;
-  errors += init_and_multiply<bfloat16, float, 2, 1, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 2, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 3, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 4, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 5, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 6, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 7, SN, 16>();
-  errors += init_and_multiply<bfloat16, float, 2, 8, SN, 16>();
-
-  errors += init_and_multiply<int8_t, int32_t, 4, 1, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 2, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 3, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 4, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 5, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 6, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 7, SN, 32>();
-  errors += init_and_multiply<int8_t, int32_t, 4, 8, SN, 32>();
+  errors += init_and_multiply<bfloat16, float, 2, 1, SN, 16, class bf16_1>();
+  errors += init_and_multiply<bfloat16, float, 2, 2, SN, 16, class bf16_2>();
+  errors += init_and_multiply<bfloat16, float, 2, 3, SN, 16, class bf16_3>();
+  errors += init_and_multiply<bfloat16, float, 2, 4, SN, 16, class bf16_4>();
+  errors += init_and_multiply<bfloat16, float, 2, 5, SN, 16, class bf16_5>();
+  errors += init_and_multiply<bfloat16, float, 2, 6, SN, 16, class bf16_6>();
+  errors += init_and_multiply<bfloat16, float, 2, 7, SN, 16, class bf16_7>();
+  errors += init_and_multiply<bfloat16, float, 2, 8, SN, 16, class bf16_8>();
+
+  errors += init_and_multiply<int8_t, int32_t, 4, 1, SN, 32, class int8_1>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 2, SN, 32, class int8_2>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 3, SN, 32, class int8_3>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 4, SN, 32, class int8_4>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 5, SN, 32, class int8_5>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 6, SN, 32, class int8_6>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 7, SN, 32, class int8_7>();
+  errors += init_and_multiply<int8_t, int32_t, 4, 8, SN, 32, class int8_8>();
 
   return errors;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
index 1ec089d0f53f2..796bdce8d0752 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
@@ -13,35 +13,41 @@ template <typename T> struct apply_add {
   void operator()(T &x) const { x = x + bfloat16(2); }
 };
 
-template <typename T, size_t M, size_t N, typename F>
-void matrix_verify_add(queue q, big_matrix<T, M, N> &A, nd_range<2> &r,
-                       const float ref, F &&lambda) {
+template <typename T, size_t M, size_t N, typename kernel_name, typename F>
+void matrix_verify_add(big_matrix<T, M, N> &A, const float ref, F &&lambda) {
   buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, N));
 
+  queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
+  nd_range<2> r({M / TM, N / TN * sg_size}, {1, 1 * sg_size});
+
   q.submit([&](handler &cgh) {
      accessor accA{bufA, cgh};
 
-     cgh.parallel_for(r, [accA, lambda](
-                             nd_item<2> spmd_item) [[intel::reqd_sub_group_size(
-                             SG_SZ)]] {
-       const auto global_idx = spmd_item.get_global_id(0);
-       const auto global_idy = spmd_item.get_global_id(1);
-       const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-       const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+     cgh.parallel_for<kernel_name>(
+         r, [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+                [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
-       sub_group sg = spmd_item.get_sub_group();
-       joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major> sub_a;
 
-       joint_matrix_fill(sg, sub_a, bfloat16(5.0));
+           joint_matrix_fill(sg, sub_a, bfloat16(5.0));
 
-       joint_matrix_apply(sg, sub_a, lambda);
+           joint_matrix_apply(sg, sub_a, lambda);
 
-       ext::intel::experimental::matrix::joint_matrix_store(
-           sg, sub_a,
-           accA.template get_multi_ptr<access::decorated::no>() +
-               (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-           N);
-     }); // parallel for
+           ext::intel::experimental::matrix::joint_matrix_store(
+               sg, sub_a,
+               accA.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N);
+         }); // parallel for
    }).wait();
   // Check if the results are correct
   {
@@ -61,14 +67,9 @@ int main() {
 
   big_matrix<bfloat16, MATRIX_M, MATRIX_N> MA((bfloat16 *)&A);
 
-  size_t NDRangeM = MATRIX_M / TM;
-  size_t NDRangeN = MATRIX_N / TN;
-  queue q;
-  nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ});
-
-  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(
-      q, MA, r, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
-  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N>(q, MA, r, 7.0,
+  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class add>(
+      MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
+  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class func_add>(MA, 7.0,
                                                   apply_add<bfloat16>());
   std::cout << "Passed\n";
   return 0;
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 9c12d1053d3bb..6a7b0bb369341 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -68,8 +68,9 @@ static constexpr void manually_unroll_loop(F &&f) {
 
 template <unsigned int rowsA, unsigned int colsA, unsigned int rowsB,
           unsigned int colsB, unsigned int vnniFactor, typename TOperand,
-          typename TResult, unsigned int sgSize = SG_SZ>
+          typename TResult>
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
+  size_t sgSize = get_sg_size<class MatMul>(q);
   range<2> global{rowsA / MCACHE1, (colsB / NCACHE1) * sgSize};
   range<2> cachelocal{MCACHE2 / MCACHE1, NCACHE2 / NCACHE1 * sgSize};
 
@@ -82,12 +83,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
   std::chrono::high_resolution_clock::time_point start =
       std::chrono::high_resolution_clock::now();
 
-  auto mk = q.submit([&](handler &h) {
-    h.parallel_for( // cache layer#1
+  auto work = [&](handler &h) { // not static: [&] binds this call's locals
+    h.parallel_for<class MatMul>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
         // loop localrange
-        [=](nd_item<2> it) [[intel::reqd_sub_group_size(sgSize)]] {
+        [=](nd_item<2> it)
+#ifdef SG_SZ
+            [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
           auto pA =
               address_space_cast<sycl::access::address_space::global_space,
                                  sycl::access::decorated::no>(A);
@@ -290,7 +295,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           } // m
 #endif
         }); // parallel_for
-  });       // queue.submit
+  }; // work: command-group function
+  q.submit(work);
+
   if (i == testIterations - 1)
     q.wait();
   std::chrono::duration<double, std::milli> duration =
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
index 5be3c485312c2..bc317ffc27d31 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
@@ -23,14 +23,19 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class imatrix>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // Matrix API has to be accessed by all the workitems in a
            // subgroup. These functions will be called once by the subgroup.
            // No code divergence between the workitems.
@@ -57,7 +62,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
                  N * 2);
 
              for (int i = 0; i < JM_ARRAY_SZ; ++i) {
@@ -75,7 +80,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                  sg, sub_c[i],
                  accC.template get_multi_ptr<access::decorated::no>() +
                      (sg_startx * TM * JM_ARRAY_SZ + TM * i) * N +
-                     sg_starty / SG_SZ * TN,
+                     sg_starty / sg_size * TN,
                  N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
index 8cb6c120d8a34..068506cc63724 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
@@ -19,15 +19,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -49,7 +52,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) { //
              joint_matrix_load(
@@ -60,14 +63,14 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
                  N * 2);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
index 91156c3fcc128..36ce0f81f0c63 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
@@ -17,15 +17,18 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<kernel_name>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<kernel_name>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -47,7 +50,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) { //
              joint_matrix_load(
@@ -59,14 +62,14 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
                  N * 2);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
index 3f02be1358844..54861eb3b1d3b 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
@@ -23,13 +23,18 @@ void matrix_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class copy>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::write>(cgh);
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class copy>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -46,13 +51,13 @@ void matrix_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            joint_matrix_copy(sg, sub_c, sub_a);
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp
index aad8aeaa5c602..53b4ca7b97412 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp
@@ -27,56 +27,60 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<float, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class mult>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
-     cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
-             [[intel::reqd_sub_group_size(SG_SZ)]] {
-               // The submatrix API has to be accessed by all the workitems in a
-               // subgroup these functions will be called once by the subgroup
-               // no code divergence between the workitems
-               const auto global_idx = spmd_item.get_global_id(0);
-               const auto global_idy = spmd_item.get_global_id(1);
-               const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-               const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+     cgh.parallel_for<class mult>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
+           // The submatrix API has to be accessed by all the workitems in a
+           // subgroup these functions will be called once by the subgroup
+           // no code divergence between the workitems
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
-               sub_group sg = spmd_item.get_sub_group();
-               joint_matrix<sub_group, half, use::a, TM, TK, layout::row_major>
-                   sub_a;
-               // For B, we assume B has been already VNNIed.
-               joint_matrix<sub_group, half, use::b, TK, TN,
-                            layout::ext_intel_packed>
-                   sub_b;
-               joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, half, use::a, TM, TK, layout::row_major>
+               sub_a;
+           // For B, we assume B has been already VNNIed.
+           joint_matrix<sub_group, half, use::b, TK, TN,
+                        layout::ext_intel_packed>
+               sub_b;
+           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_c;
 
-               joint_matrix_load(
-                   sg, sub_c,
-                   accC.template get_multi_ptr<access::decorated::no>() +
-                       (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-                   N, layout::row_major);
-               for (int k = 0; k < K / TK; k += 1) {
-                 joint_matrix_load(
-                     sg, sub_a,
-                     accA.template get_multi_ptr<access::decorated::no>() +
-                         (sg_startx * TM) * K + k * TK,
-                     K);
-                 joint_matrix_load(
-                     sg, sub_b,
-                     accB.template get_multi_ptr<access::decorated::no>() +
-                         (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2,
-                     N * 2);
-                 joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-               }
-               joint_matrix_store(
-                   sg, sub_c,
-                   accC.template get_multi_ptr<access::decorated::no>() +
-                       (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-                   N, layout::row_major);
-             }); // parallel for
+           joint_matrix_load(
+               sg, sub_c,
+               accC.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+           for (int k = 0; k < K / TK; k += 1) {
+             joint_matrix_load(
+                 sg, sub_a,
+                 accA.template get_multi_ptr<access::decorated::no>() +
+                     (sg_startx * TM) * K + k * TK,
+                 K);
+             joint_matrix_load(
+                 sg, sub_b,
+                 accB.template get_multi_ptr<access::decorated::no>() +
+                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
+                 N * 2);
+             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+           }
+           joint_matrix_store(
+               sg, sub_c,
+               accC.template get_multi_ptr<access::decorated::no>() +
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+               N, layout::row_major);
+         }); // parallel for
    }).wait();
 }
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
index 96993082d8cb5..625b41f3037b8 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp
@@ -26,15 +26,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -61,14 +65,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK) * N + sg_starty / SG_SZ * TN,
+                     (k * TK) * N + sg_starty / sg_size * TN,
                  N);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
index 031c7753de425..5acc54a412096 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
@@ -15,7 +15,4 @@
 // incompatible on the current device
 
 #include "common.hpp"
-
-static constexpr size_t SN = 16;
-
 #include "joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp
index a0b468120ebd3..7aba5911c8386 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp
@@ -22,14 +22,19 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      sycl::accessor accC{bufC, cgh, sycl::read_write};
      sycl::accessor accA{bufA, cgh, sycl::read_only};
      sycl::accessor accB{bufB, cgh, sycl::read_only};
 
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+     cgh.parallel_for<class imatrix>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            const auto global_idx = spmd_item.get_global_id(0);
            const auto global_idy = spmd_item.get_global_id(1);
            const auto sg_startx = global_idx - spmd_item.get_local_id(0);
@@ -44,7 +49,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -56,7 +61,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
                      (k * TK / vnniFactor) * (N * vnniFactor) +
-                     sg_starty / SG_SZ * TN * vnniFactor,
+                     sg_starty / sg_size * TN * vnniFactor,
                  N * vnniFactor);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
@@ -95,8 +100,9 @@ int main() {
     init_and_multiply<bfloat16, float, 2, 1, 500,
                       16>(); // 500 is not correct size
   } catch (const sycl::exception &e) {
-    if (e.code() == errc::kernel_not_supported)
+    if (e.code() == errc::invalid)
       return 0;
+    throw;
   }
 
   return 1;
diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp
index ef67ebbd951f3..3e00c667c2505 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp
@@ -28,15 +28,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -64,14 +68,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp
index 3973a7b516bc8..f8feb25d99229 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp
@@ -28,15 +28,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -57,7 +61,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -68,14 +72,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
index 2b2fae59cd94d..536fa84581f27 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
@@ -27,15 +27,18 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]]
-
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
          {
            // The matrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -56,7 +59,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K; k += TK) {
              joint_matrix_load(
@@ -67,7 +70,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k) * (N) + sg_starty / SG_SZ * TN,
+                     (k) * (N) + sg_starty / sg_size * TN,
                  N);
              // If no rounding to tf32 function is called, joint_matrix_mad
              // function will work on truncated floats.
@@ -81,7 +84,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index 624cfdb256e7d..faea43b062477 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -11,6 +11,9 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
+template <size_t TM>
+class LS;
+
 template <size_t TM, size_t TN, typename T1, size_t NUM_ROWS, size_t NUM_COLS>
 void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
                            queue q) {
@@ -22,47 +25,51 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
 
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
-
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
-           auto p_input =
-               address_space_cast<sycl::access::address_space::global_space,
-                                  sycl::access::decorated::no>(input);
-
-           auto p_out_col_major =
-               address_space_cast<sycl::access::address_space::global_space,
-                                  sycl::access::decorated::no>(out_col_major);
-           auto p_out_row_major =
-               address_space_cast<sycl::access::address_space::global_space,
-                                  sycl::access::decorated::no>(out_row_major);
-
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
-
-           auto row_major_offset =
-               (sg_startx * TM) * N + (sg_starty / SG_SZ * TN);
-           auto col_major_offset =
-               (sg_startx * TM) + (sg_starty / SG_SZ * TN) * M;
-
-           joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
-                             layout::col_major);
-
-           joint_matrix_store(sg, sub_matrix,
-                              p_out_col_major + row_major_offset, N,
-                              layout::row_major);
-
-           joint_matrix_store(sg, sub_matrix,
-                              p_out_row_major + col_major_offset, M,
-                              layout::col_major);
-         }); // parallel for
-   }).wait();
+  size_t sg_size = get_sg_size<class LS<TM>>(q);
+
+  static auto work = [&](handler &cgh) {
+    cgh.parallel_for<class LS<TM>>(
+        nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+        [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+            [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+        {
+          auto p_input =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(input);
+
+          auto p_out_col_major =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(out_col_major);
+          auto p_out_row_major =
+              address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no>(out_row_major);
+
+          const auto global_idx = spmd_item.get_global_id(0);
+          const auto global_idy = spmd_item.get_global_id(1);
+          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+          const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+          sub_group sg = spmd_item.get_sub_group();
+          joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
+
+          auto row_major_offset =
+              (sg_startx * TM) * N + (sg_starty / sg_size * TN);
+          auto col_major_offset =
+              (sg_startx * TM) + (sg_starty / sg_size * TN) * M;
+
+          joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
+                            layout::col_major);
+
+          joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset,
+                             N, layout::row_major);
+
+          joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset,
+                             M, layout::col_major);
+        }); // parallel for
+  };
+  q.submit(work).wait();
 }
 
 template <size_t TM> void run_matrix_test() {
diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp
index 5441df5fe2542..db8eda82ef239 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp
@@ -28,16 +28,18 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
-
+#endif
          {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
@@ -59,7 +61,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -71,14 +73,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp
index 4dcb60f4330fc..7e7edb700debb 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp
@@ -28,15 +28,19 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
   buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
+  size_t sg_size = get_sg_size<class imatrix>(q);
   q.submit([&](handler &cgh) {
      auto accC = bufC.get_access<access::mode::read_write>(cgh);
      auto accA = bufA.get_access<access::mode::read_write>(cgh);
      auto accB = bufB.get_access<access::mode::read_write>(cgh);
 
      cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N,
-          K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] {
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
+#ifdef SG_SZ
+             [[intel::reqd_sub_group_size(SG_SZ)]]
+#endif
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
@@ -57,7 +61,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
            for (int k = 0; k < K / TK; k += 1) {
              joint_matrix_load(
@@ -69,14 +73,14 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+                     (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4,
                  N * 4);
              joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
            }
            joint_matrix_store(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
                N, layout::row_major);
          }); // parallel for
    }).wait();

From a5e15a2e5549bb6c9ce271046e867fca6e9a8c51 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 15 May 2024 15:37:33 -0700
Subject: [PATCH 22/42] clang-format

---
 sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp          | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp        | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp       | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp   | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp         | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp     | 4 ++--
 sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp             | 2 +-
 sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp       | 3 +--
 9 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
index b424a01a7c6a6..32ceaf8c730a0 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp
@@ -39,7 +39,7 @@ void matrix_sum_rows(big_matrix<T1, M, N> &C, float *sum_rows) {
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-           {
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
index 8e9880235c2b2..b48e46e18de3d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp
@@ -102,8 +102,8 @@ int init_and_multiply() {
   big_matrix<Ta, MATRIX_K / vnni_factor, MATRIX_N * vnni_factor> MBvnni(
       (Ta *)&Bvnni);
 
-  matrix_multiply<Tc, Ta, MATRIX_M, MATRIX_N, MATRIX_K, vnni_factor, tM, tN,
-                  tK, kernel_name>(MC, MA, MBvnni);
+  matrix_multiply<Tc, Ta, MATRIX_M, MATRIX_N, MATRIX_K, vnni_factor, tM, tN, tK,
+                  kernel_name>(MC, MA, MBvnni);
   matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K);
 
   bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D);
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
index 796bdce8d0752..3d3c6304952e5 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp
@@ -69,8 +69,8 @@ int main() {
 
   matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class add>(
       MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); });
-  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class func_add>(MA, 7.0,
-                                                  apply_add<bfloat16>());
+  matrix_verify_add<bfloat16, MATRIX_M, MATRIX_N, class func_add>(
+      MA, 7.0, apply_add<bfloat16>());
   std::cout << "Passed\n";
   return 0;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
index bc317ffc27d31..9aefc370bd0c6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
@@ -31,11 +31,11 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-          {
+         {
            // Matrix API has to be accessed by all the workitems in a
            // subgroup. These functions will be called once by the subgroup.
            // No code divergence between the workitems.
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
index 068506cc63724..aef22d35f7d17 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp
@@ -27,7 +27,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
index 36ce0f81f0c63..6a7182c41985d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp
@@ -25,7 +25,7 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
 
      cgh.parallel_for<kernel_name>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
index 54861eb3b1d3b..8ac48511c7e10 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp
@@ -30,11 +30,11 @@ void matrix_copy(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A) {
 
      cgh.parallel_for<class copy>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-          {
+         {
            // The submatrix API has to be accessed by all the workitems in a
            // subgroup these functions will be called once by the subgroup no
            // code divergence between the workitems
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
index 536fa84581f27..69991884c0710 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp
@@ -35,7 +35,7 @@ void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
 
      cgh.parallel_for<class imatrix>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-         [=](nd_item<2> spmd_item) 
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
              [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index faea43b062477..5de94de6a18ba 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -11,8 +11,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-template <size_t TM>
-class LS;
+template <size_t TM> class LS;
 
 template <size_t TM, size_t TN, typename T1, size_t NUM_ROWS, size_t NUM_COLS>
 void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,

From 4cc31ddd4248a5a5558164e4dbc5a4045c075ef6 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Thu, 16 May 2024 08:15:33 -0700
Subject: [PATCH 23/42] Clean up nits

---
 sycl/test-e2e/Matrix/element_wise_abc_impl.hpp                | 1 -
 sycl/test-e2e/Matrix/get_coord_float_matC.cpp                 | 4 ----
 sycl/test-e2e/Matrix/get_coord_int8_matA.cpp                  | 4 ----
 sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp             | 3 ---
 sycl/test-e2e/Matrix/get_coord_int8_matB.cpp                  | 4 ----
 sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp               | 3 ---
 sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp                | 3 ---
 .../Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp      | 3 ---
 sycl/test-e2e/Matrix/joint_matrix_half.cpp                    | 3 ---
 .../test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp | 3 ---
 sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp               | 3 ---
 11 files changed, 34 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index dea0cf882eaaf..655fa90275f40 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -15,7 +15,6 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define TM 8
 #define TK 32
-// class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
index 57c9a00d98fd4..af7e8e1745781 100644
--- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp
@@ -11,10 +11,6 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TN = 16;
 
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
index 67fa811f2d764..d29217577443e 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp
@@ -11,10 +11,6 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TN = 16;
 
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
index 6f57ab5b4e63c..3f39ebf731801 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp
@@ -131,9 +131,6 @@ int main() {
 
   big_matrix<int8_t, MATRIX_M, MATRIX_K> MA((int8_t *)&A);
 
-  size_t NDRangeM = MATRIX_M / TM;
-  size_t NDRangeK = MATRIX_K / TK;
-
   for (int i = 0; i < MATRIX_M; i++) {
     for (int j = 0; j < MATRIX_K; j++) {
       A[i][j] = i + j;
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index 782df68d21ff5..ad064fd82fc0a 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -12,10 +12,6 @@
 // XFAIL: cpu
 
 #include "common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t TN = 16;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
index 0eb13cf57347c..1478914d1e44f 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 // Sub-matrix N dimension
 static constexpr size_t SN = 16;
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
index 2222cbb605a15..1985bcb6a4fb9 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
index 19d12915b4a95..21d5f1239cd8d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp
@@ -17,9 +17,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
index 9281e47f572d2..0bacfa93792d6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp
@@ -14,9 +14,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
index fb29cc2baaf74..37769a41f7003 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp
@@ -17,9 +17,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
index 8dcddb841721d..f592057ce94d5 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
 constexpr size_t TN = 16;
 
 #include "joint_matrix_int8_vnni_impl.hpp"

From 498fa1eba53cda1257d1eb6cb6020236b5d1b7d9 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Thu, 16 May 2024 12:22:22 -0700
Subject: [PATCH 24/42] Managed to remove the static code

---
 .../joint_matrix_bf16_fill_k_cache_impl.hpp   |  5 +-
 .../Matrix/joint_matrix_transposeC_impl.hpp   | 83 ++++++++++---------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index 6a7b0bb369341..e389ea7137428 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -83,7 +83,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
   std::chrono::high_resolution_clock::time_point start =
       std::chrono::high_resolution_clock::now();
 
-  static auto work = [&](handler &h) {
+  q.submit([&](handler &h) {
     h.parallel_for<class MatMul>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
@@ -295,8 +295,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           } // m
 #endif
         }); // parallel_for
-  }; // queue.submit
-  q.submit(work);
+  });       // queue.submit
 
   if (i == testIterations - 1)
     q.wait();
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index 5de94de6a18ba..24ba24a264f0d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -26,49 +26,50 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
   size_t NDRangeN = N / TN;
   size_t sg_size = get_sg_size<class LS<TM>>(q);
 
-  static auto work = [&](handler &cgh) {
-    cgh.parallel_for<class LS<TM>>(
-        nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
-        [=](nd_item<2> spmd_item)
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<class LS<TM>>(
+         nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
+         [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
-            [[intel::reqd_sub_group_size(SG_SZ)]]
+             [[intel::reqd_sub_group_size(SG_SZ)]]
 #endif
-        {
-          auto p_input =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(input);
-
-          auto p_out_col_major =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(out_col_major);
-          auto p_out_row_major =
-              address_space_cast<sycl::access::address_space::global_space,
-                                 sycl::access::decorated::no>(out_row_major);
-
-          const auto global_idx = spmd_item.get_global_id(0);
-          const auto global_idy = spmd_item.get_global_id(1);
-          const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-          const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-          sub_group sg = spmd_item.get_sub_group();
-          joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
-
-          auto row_major_offset =
-              (sg_startx * TM) * N + (sg_starty / sg_size * TN);
-          auto col_major_offset =
-              (sg_startx * TM) + (sg_starty / sg_size * TN) * M;
-
-          joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
-                            layout::col_major);
-
-          joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset,
-                             N, layout::row_major);
-
-          joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset,
-                             M, layout::col_major);
-        }); // parallel for
-  };
-  q.submit(work).wait();
+         {
+           auto p_input =
+               address_space_cast<sycl::access::address_space::global_space,
+                                  sycl::access::decorated::no>(input);
+
+           auto p_out_col_major =
+               address_space_cast<sycl::access::address_space::global_space,
+                                  sycl::access::decorated::no>(out_col_major);
+           auto p_out_row_major =
+               address_space_cast<sycl::access::address_space::global_space,
+                                  sycl::access::decorated::no>(out_row_major);
+
+           const auto global_idx = spmd_item.get_global_id(0);
+           const auto global_idy = spmd_item.get_global_id(1);
+           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+           sub_group sg = spmd_item.get_sub_group();
+           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
+
+           auto row_major_offset =
+               (sg_startx * TM) * N + (sg_starty / sg_size * TN);
+           auto col_major_offset =
+               (sg_startx * TM) + (sg_starty / sg_size * TN) * M;
+
+           joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M,
+                             layout::col_major);
+
+           joint_matrix_store(sg, sub_matrix,
+                              p_out_col_major + row_major_offset, N,
+                              layout::row_major);
+
+           joint_matrix_store(sg, sub_matrix,
+                              p_out_row_major + col_major_offset, M,
+                              layout::col_major);
+         }); // parallel for
+   }).wait();
 }
 
 template <size_t TM> void run_matrix_test() {

From 8ab7f803d58ca589f23080fbf3621877764b92d2 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Fri, 17 May 2024 14:27:28 -0700
Subject: [PATCH 25/42] element_wise_abc

---
 .../test-e2e/Matrix/SG32/element_wise_abc.cpp |  3 +-
 .../test-e2e/Matrix/XMX8/element_wise_abc.cpp | 17 -----
 sycl/test-e2e/Matrix/element_wise_abc.cpp     |  5 +-
 .../test-e2e/Matrix/element_wise_abc_impl.hpp | 64 +++++++++++++------
 4 files changed, 46 insertions(+), 43 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
index 4833404610369..8b4cd57e4b477 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
@@ -14,6 +14,5 @@
 #include "../common.hpp"
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
deleted file mode 100644
index d7df42000249a..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-//==----------- element_wise_abc.cpp  - DPC++ joint_matrix------------- ----==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-
-constexpr size_t TN = 8;
-
-#include "../element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/element_wise_abc.cpp
index 0a6a4e4abaa03..8a4e277bf6d11 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc.cpp
@@ -5,13 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-
 #include "element_wise_abc_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index 655fa90275f40..e61f747f75cc0 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -13,27 +13,27 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-#define TM 8
-#define TK 32
+template <size_t M, size_t N, size_t K, int vnniFactor>
+class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
 void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
                           big_matrix<T2, K / vnniFactor, N * vnniFactor> &B) {
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
+  size_t NDRangeM = 1;
+  size_t NDRangeN = 1;
   buffer<T2, 2> bufA(A.get_data(), range<2>(M, K));
   buffer<T2, 2> bufB(B.get_data(), range<2>(K, N));
   buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));
 
   queue q;
-  size_t sg_size = get_sg_size<class add>(q);
+  size_t sg_size = get_sg_size<add<M, N, K, vnniFactor>>(q);
   q.submit([&](handler &cgh) {
      accessor accC{bufC, cgh};
      accessor accA{bufA, cgh};
      accessor accB{bufB, cgh};
 
-     cgh.parallel_for<class add>(
+     cgh.parallel_for<add<M, N, K, vnniFactor>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -49,48 +49,72 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
            sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, T2, use::a, TM, TK, layout::row_major> sub_a;
+           joint_matrix<sub_group, T2, use::a, M, K, layout::row_major> sub_a;
            // For B, we assume B has been already VNNIed.
-           joint_matrix<sub_group, T2, use::b, TK, TN, layout::ext_intel_packed>
+           joint_matrix<sub_group, T2, use::b, K, N, layout::ext_intel_packed>
                sub_b;
-           joint_matrix<sub_group, T1, use::accumulator, TM, TN> sub_c;
+           joint_matrix<sub_group, T1, use::accumulator, M, N> sub_c;
 
            joint_matrix_load(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * K,
+                   (sg_startx * M) * K,
                K);
            joint_matrix_apply(sg, sub_a, [](T2 &x) { x += 1; });
 
            joint_matrix_load(
                sg, sub_b,
                accB.template get_multi_ptr<access::decorated::no>() +
-                   sg_starty / sg_size * TN * vnniFactor,
+                   sg_starty / sg_size * N * vnniFactor,
                N * vnniFactor);
            joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; });
 
            joint_matrix_load(
                sg, sub_c,
                accC.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / sg_size * TN,
+                   (sg_startx * M) * N + sg_starty / sg_size * N,
                N, layout::row_major);
            joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; });
          }); // parallel for
    }).wait();
 }
 
+template <typename Ta, typename Tc, size_t TM, size_t TN, size_t TK, size_t VF>
+void test() {
+  Tc A[TM][TK];
+  Tc B[TK / VF][TN * VF];
+  Ta C[TM][TN];
+
+  big_matrix<Ta, TM, TN> MC((Ta *)&C);
+  big_matrix<Tc, TM, TK> MA((Tc *)&A);
+  big_matrix<Tc, TK / VF, TN * VF> MB((Tc *)&B);
+
+  return matrix_elem_wise_ops<Ta, int8_t, TM, TN, TK, VF>(MC, MA, MB);
+}
+
 int main() {
-  static constexpr unsigned vnniFactor = 4;
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
 
-  int8_t A[TM][TK];
-  int8_t B[TK / vnniFactor][TN * vnniFactor];
-  int32_t C[TM][TN];
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<int32_t, int8_t, 16, 16, 64, 4>();
+      break;
+    }
 
-  big_matrix<int32_t, TM, TN> MC((int32_t *)&C);
-  big_matrix<int8_t, TM, TK> MA((int8_t *)&A);
-  big_matrix<int8_t, TK / vnniFactor, TN * vnniFactor> MB((int8_t *)&B);
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<int32_t, int8_t, 8, 16, 32, 4>();
+      break;
+    }
 
-  matrix_elem_wise_ops<int32_t, int8_t, TM, TN, TK, vnniFactor>(MC, MA, MB);
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<int32_t, int8_t, 8, 8, 32, 4>();
+      break;
+    }
+  }
 
   return 0;
 }

From 9049d2faaff73596554b3406702d2614b39067cb Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 07:28:43 -0700
Subject: [PATCH 26/42] WIP: element_wise_all_ops_half

---
 .../Matrix/SG32/element_wise_all_ops_half.cpp |  1 -
 .../Matrix/XMX8/element_wise_all_ops_half.cpp | 19 -----
 .../Matrix/element_wise_all_ops_half.cpp      |  3 -
 .../Matrix/element_wise_all_ops_half_impl.hpp | 78 +++++++++++++------
 4 files changed, 55 insertions(+), 46 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
index 3916aaff03867..b04fdff3c0819 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
@@ -16,6 +16,5 @@
 #include "../common.hpp"
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
deleted file mode 100644
index 826b99dfcf306..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//==----------- element_wise_all_ops_half.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: aspect-fp16
-// REQUIRES: matrix-xmx8
-// REQUIRES: matrix-fp16
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-
-constexpr size_t TN = 8;
-
-#include "../element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
index c07d19ed73f2e..e60a6e720cf03 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
@@ -13,7 +13,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-
 #include "element_wise_all_ops_half_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
index c252ed73eb00b..34c79256b4813 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
@@ -5,9 +5,16 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-
-#define TM 8
-#define TK 16
+template <size_t TileM, size_t TileN, size_t TileK>
+class add;
+template <size_t TileM, size_t TileN, size_t TileK>
+class sub;
+template <size_t TileM, size_t TileN, size_t TileK>
+class mult;
+template <size_t TileM, size_t TileN, size_t TileK>
+class divide;
+template <size_t TileM, size_t TileN, size_t TileK>
+class logic;
 
 template <typename T, size_t M, size_t N, typename R>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> C,
@@ -60,31 +67,31 @@ void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
   assert_ops_ref<T, M, N, R>(bufA.get_host_access(read_only), ref);
 }
 
-int main() {
-
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  half A[MATRIX_M][MATRIX_N];
-  big_matrix<half, MATRIX_M, MATRIX_N> MA((half *)&A);
+template <typename Ta, typename Tc, size_t TM, size_t TN, size_t TK>
+void test() {
+  constexpr size_t MATRIX_M = TM * 2;
+  constexpr size_t MATRIX_N = TN * 2;
+  Ta A[MATRIX_M][MATRIX_N];
+  big_matrix<Ta, MATRIX_M, MATRIX_N> MA((Ta *)&A);
 
-  matrix_verify_op<half, MATRIX_M, MATRIX_N, TM, TN, TK, class add, float>(
-      MA, 7.0, [=](auto &x) { x = x + static_cast<half>(2); });
-  matrix_verify_op<half, MATRIX_M, MATRIX_N, TM, TN, TK, class sub, float>(
-      MA, 3.0, [=](auto &x) { x = x - static_cast<half>(2); });
-  matrix_verify_op<half, MATRIX_M, MATRIX_N, TM, TN, TK, class mult, float>(
-      MA, 15.0, [=](auto &x) { x = x * static_cast<half>(3.0); });
-  matrix_verify_op<half, MATRIX_M, MATRIX_N, TM, TN, TK, class div, float>(
-      MA, 2.5, [=](auto &x) { x = x / static_cast<half>(2.0); });
-  matrix_verify_op<half, MATRIX_M, MATRIX_N, TM, TN, TK, class logic, float>(
+  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, add<TM, TN, TK>, Tc>(
+      MA, 7.0, [=](auto &x) { x = x + static_cast<Ta>(2); });
+  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, sub<TM, TN, TK>, Tc>(
+      MA, 3.0, [=](auto &x) { x = x - static_cast<Ta>(2); });
+  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, mult<TM, TN, TK>, Tc>(
+      MA, 15.0, [=](auto &x) { x = x * static_cast<Ta>(3.0); });
+  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, divide<TM, TN, TK>, Tc>(
+      MA, 2.5, [=](auto &x) { x = x / static_cast<Ta>(2.0); });
+  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, logic<TM, TN, TK>, Tc>(
       MA, 7.0, [=](auto &x) {
         if (x) {
-          if (x > static_cast<half>(2.0) || x >= static_cast<half>(2.0) ||
-              x < static_cast<half>(2.0) || x <= static_cast<half>(2.0)) {
-            half val =
-                (x != static_cast<half>(2.0)) ? x : static_cast<half>(2.0);
+          if (x > static_cast<Ta>(2.0) || x >= static_cast<Ta>(2.0) ||
+              x < static_cast<Ta>(2.0) || x <= static_cast<Ta>(2.0)) {
+            Ta val =
+                (x != static_cast<Ta>(2.0)) ? x : static_cast<Ta>(2.0);
             val--;
             val++;
-            if (x == static_cast<half>(2.0)) {
+            if (x == static_cast<Ta>(2.0)) {
               val -= 2;
               val *= 3;
               val /= 2;
@@ -95,6 +102,31 @@ int main() {
           }
         }
       });
+}
+
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<half, float, 16, 16, 32>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<half, float, 8, 16, 16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<half, float, 8, 8, 16>();
+      break;
+    }
+  }
 
   return 0;
 }

From 6eae2dacd1a903d8d720dcb89001710d0d327dbd Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 21 May 2024 12:36:37 -0700
Subject: [PATCH 27/42] Expanded tests that use combos

---
 sycl/test-e2e/Matrix/element_wise_ops_impl.hpp                | 1 +
 sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp | 4 ++++
 .../test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp | 2 ++
 3 files changed, 7 insertions(+)

diff --git a/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp b/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp
index edde026ed877e..8ffbbf8eabff1 100644
--- a/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp
@@ -132,6 +132,7 @@ int main() {
       // These combination are not currently supported for subgroup size = 32 in
       // IGC
       passed &= test<bfloat16, float, 16, 16, 16, 2, class pvc_bf16_16x16x16>();
+      passed &= test<bfloat16, float, 1, 64, 16, 2, class pvc_bf16_1x64x16>();
       passed &= test<bfloat16, float, 32, 64, 16, 2, class pvc_bf16_32x64x16>();
 #endif
       break;
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp
index 1e8f58f3dc55d..9751571bcbcf5 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp
@@ -139,6 +139,10 @@ int main() {
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
       passed &= test<int8_t, int32_t, 8, 16, 32, class pvc_int_8x16x32>();
       passed &= test<bfloat16, float, 8, 16, 16, class pvc_bf16_8x16x16>();
+// This combination is not currently supported for sub group size = 32 in IGC
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      passed &= test<bfloat16, float, 16, 16, 16, class pvc_bf16_16x16x16>();
+#endif
       break;
     }
 
diff --git a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp
index 44b09042b998f..754d49c354d01 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp
@@ -123,6 +123,8 @@ int main() {
                             int32_t>();
 
       if (combination.nsize == 16) { // architecture::intel_gpu_pvc
+        res += gemm_row_major<16, 16, 16, class bf16_16x16x16, bfloat16,
+                              bfloat16, float>();
         res += gemm_row_major<1, 64, 16, class bf16_1x64x16, bfloat16, bfloat16,
                               float>();
         res += gemm_row_major<32, 64, 16, class bf16_32x64x16, bfloat16,

From 2a6b455df9fbc1c90620e9d98c2eab5397510f7c Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 22 May 2024 09:18:31 -0700
Subject: [PATCH 28/42] Pass: element_wise_all_ops_int8_packed

---
 .../SG32/element_wise_all_ops_int8_packed.cpp |  1 -
 .../XMX8/element_wise_all_ops_int8_packed.cpp | 19 ----
 .../element_wise_all_ops_int8_packed.cpp      |  3 -
 .../element_wise_all_ops_int8_packed_impl.hpp | 94 +++++++++++++------
 4 files changed, 63 insertions(+), 54 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
index ad644c8734475..2d62023ad7d01 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
@@ -16,6 +16,5 @@
 #include "../common.hpp"
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
deleted file mode 100644
index 9ff39c8d516d0..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//==------ element_wise_all_ops_int8_packed.cpp  - DPC++ joint_matrix-------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// This test stores the matrix B that is VNNIed (packed).
-
-#include "../common.hpp"
-
-constexpr size_t TN = 8;
-
-#include "../element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
index 24f82f47e8fcd..43370673c75f7 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
@@ -13,7 +13,4 @@
 // This test stores the matrix B that is VNNIed (packed).
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-
 #include "element_wise_all_ops_int8_packed_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
index 063ffe9717f83..953531927ce83 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
@@ -6,8 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define TM 8
-#define TK 32
+template <size_t TileM, size_t TileN, size_t TileK> class add;
+template <size_t TileM, size_t TileN, size_t TileK> class sub;
+template <size_t TileM, size_t TileN, size_t TileK> class mul;
+template <size_t TileM, size_t TileN, size_t TileK> class divide;
+template <size_t TileM, size_t TileN, size_t TileK> class logic;
 
 template <typename T, size_t M, size_t N, typename R>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
@@ -61,40 +64,69 @@ void matrix_verify_op(big_matrix<T, M, N> &B, const R ref, OP op) {
   assert_ops_ref<T, M, N, R>(bufB.get_host_access(read_only), ref);
 }
 
-static constexpr size_t MATRIX_M = TM * 2;
-static constexpr size_t MATRIX_N = TN * 2;
-int8_t B[MATRIX_M][MATRIX_N];
-
-int main() {
+template <size_t TM, size_t TN, size_t TK> void test() {
+  static constexpr size_t MATRIX_M = TM * 2;
+  static constexpr size_t MATRIX_N = TN * 2;
+  int8_t B[MATRIX_M][MATRIX_N];
 
   big_matrix<int8_t, MATRIX_M, MATRIX_N> MB((int8_t *)&B);
 
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class add, int>(
-      MB, 7, [=](auto &x) { x = x + 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class sub, int>(
-      MB, 3, [=](auto &x) { x = x - 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class mul, int>(
-      MB, 10, [=](auto &x) { x = x * 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class div, int>(
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
+                   add<TM, TN, TK>, int>(MB, 7,
+                                               [=](auto &x) { x = x + 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
+                   sub<TM, TN, TK>, int>(MB, 3,
+                                               [=](auto &x) { x = x - 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
+                   mul<TM, TN, TK>, int>(MB, 10,
+                                               [=](auto &x) { x = x * 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
+                   divide<TM, TN, TK>, int>(
       MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class logic, int>(
-      MB, 7, [=](auto &x) {
-        if (x) {
-          if (x > 2 || x >= 2 || x < 2 || x <= 2) {
-            int8_t val = (x != 2) ? x : 2;
-            val--;
-            val++;
-            if (x == 2) {
-              val -= 2;
-              val *= 3;
-              val /= 2;
-            } else {
-              val += 2;
-            }
-            x = val;
-          }
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
+                   logic<TM, TN, TK>, int>(MB, 7, [=](auto &x) {
+    if (x) {
+      if (x > 2 || x >= 2 || x < 2 || x <= 2) {
+        int8_t val = (x != 2) ? x : 2;
+        val--;
+        val++;
+        if (x == 2) {
+          val -= 2;
+          val *= 3;
+          val /= 2;
+        } else {
+          val += 2;
         }
-      });
+        x = val;
+      }
+    }
+  });
+}
+
+int main() {
+
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<16, 16, 64>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<8, 16, 32>();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<8, 8, 32>();
+      break;
+    }
+  }
 
   return 0;
 }

From c88783b5e1727c936c30320aa57989bd83a68ff6 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 22 May 2024 09:26:42 -0700
Subject: [PATCH 29/42] Pass: element_wise_all_ops_int8

---
 .../Matrix/SG32/element_wise_all_ops_int8.cpp |  1 -
 .../Matrix/XMX8/element_wise_all_ops_int8.cpp | 17 ----
 .../Matrix/element_wise_all_ops_int8.cpp      |  3 -
 .../Matrix/element_wise_all_ops_int8_impl.hpp | 92 ++++++++++++-------
 .../element_wise_all_ops_int8_packed_impl.hpp | 26 +++---
 5 files changed, 71 insertions(+), 68 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
index ddfa39c541c0a..fbc965df97a46 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
@@ -14,6 +14,5 @@
 #include "../common.hpp"
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
deleted file mode 100644
index a39cb6664d100..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-//==----------- element_wise_all_ops_int8.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-
-constexpr size_t TN = 8;
-
-#include "../element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
index e1a2cf4eecfa1..11b488a8298ca 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
@@ -11,7 +11,4 @@
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-
 #include "element_wise_all_ops_int8_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
index c5025543bfd78..b60d24f00f769 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
@@ -6,8 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define TM 8
-#define TK 32
+template <size_t TileM, size_t TileN, size_t TileK> class add;
+template <size_t TileM, size_t TileN, size_t TileK> class sub;
+template <size_t TileM, size_t TileN, size_t TileK> class mul;
+template <size_t TileM, size_t TileN, size_t TileK> class divide;
+template <size_t TileM, size_t TileN, size_t TileK> class logic;
 
 template <typename T, size_t M, size_t N, typename R>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
@@ -59,40 +62,65 @@ void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
   assert_ops_ref<T, M, N, R>(bufA.get_host_access(read_only), ref);
 }
 
-static constexpr size_t MATRIX_M = TM * 2;
-static constexpr size_t MATRIX_N = TN * 2;
-int8_t A[MATRIX_M][MATRIX_N];
-
-int main() {
+template <size_t TM, size_t TN, size_t TK> void test() {
+  static constexpr size_t MATRIX_M = TM * 2;
+  static constexpr size_t MATRIX_N = TN * 2;
+  int8_t A[MATRIX_M][MATRIX_N];
 
   big_matrix<int8_t, MATRIX_M, MATRIX_N> MA((int8_t *)&A);
 
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class add, int>(
-      MA, 7, [=](auto &x) { x = x + 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class sub, int>(
-      MA, 3, [=](auto &x) { x = x - 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class mul, int>(
-      MA, 10, [=](auto &x) { x = x * 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class div, int>(
-      MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, class logic, int>(
-      MA, 7, [=](auto &x) {
-        if (x) {
-          if (x > 2 || x >= 2 || x < 2 || x <= 2) {
-            int8_t val = (x != 2) ? x : 2;
-            val--;
-            val++;
-            if (x == 2) {
-              val -= 2;
-              val *= 3;
-              val /= 2;
-            } else {
-              val += 2;
-            }
-            x = val;
-          }
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, add<TM, TN, TK>,
+                   int>(MA, 7, [=](auto &x) { x = x + 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, sub<TM, TN, TK>,
+                   int>(MA, 3, [=](auto &x) { x = x - 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, mul<TM, TN, TK>,
+                   int>(MA, 10, [=](auto &x) { x = x * 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, divide<TM, TN, TK>,
+                   int>(MA, 2,
+                        [=](auto &x) { x = x / 2; }); // truncation is expected
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, logic<TM, TN, TK>,
+                   int>(MA, 7, [=](auto &x) {
+    if (x) {
+      if (x > 2 || x >= 2 || x < 2 || x <= 2) {
+        int8_t val = (x != 2) ? x : 2;
+        val--;
+        val++;
+        if (x == 2) {
+          val -= 2;
+          val *= 3;
+          val /= 2;
+        } else {
+          val += 2;
         }
-      });
+        x = val;
+      }
+    }
+  });
+}
+
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<16, 16, 64>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<8, 16, 32>();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<8, 8, 32>();
+      break;
+    }
+  }
 
   return 0;
 }
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
index 953531927ce83..a3ed8d73bace2 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
@@ -71,20 +71,17 @@ template <size_t TM, size_t TN, size_t TK> void test() {
 
   big_matrix<int8_t, MATRIX_M, MATRIX_N> MB((int8_t *)&B);
 
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
-                   add<TM, TN, TK>, int>(MB, 7,
-                                               [=](auto &x) { x = x + 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
-                   sub<TM, TN, TK>, int>(MB, 3,
-                                               [=](auto &x) { x = x - 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
-                   mul<TM, TN, TK>, int>(MB, 10,
-                                               [=](auto &x) { x = x * 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
-                   divide<TM, TN, TK>, int>(
-      MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK,
-                   logic<TM, TN, TK>, int>(MB, 7, [=](auto &x) {
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, add<TM, TN, TK>,
+                   int>(MB, 7, [=](auto &x) { x = x + 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, sub<TM, TN, TK>,
+                   int>(MB, 3, [=](auto &x) { x = x - 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, mul<TM, TN, TK>,
+                   int>(MB, 10, [=](auto &x) { x = x * 2; });
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, divide<TM, TN, TK>,
+                   int>(MB, 2,
+                        [=](auto &x) { x = x / 2; }); // truncation is expected
+  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, logic<TM, TN, TK>,
+                   int>(MB, 7, [=](auto &x) {
     if (x) {
       if (x > 2 || x >= 2 || x < 2 || x <= 2) {
         int8_t val = (x != 2) ? x : 2;
@@ -104,7 +101,6 @@ template <size_t TM, size_t TN, size_t TK> void test() {
 }
 
 int main() {
-
   queue q;
   std::vector<combination> combinations =
       q.get_device()

From 317e3c20bef0b912539935a26de511979f51f840 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 22 May 2024 10:16:03 -0700
Subject: [PATCH 30/42] Pass: element_wise_all_sizes

---
 .../Matrix/SG32/element_wise_all_ops_int8.cpp |  2 +-
 .../Matrix/XMX8/element_wise_all_sizes.cpp    | 18 -----
 .../XMX8/element_wise_all_sizes_no_split.cpp  |  3 -
 .../Matrix/element_wise_all_sizes.cpp         |  2 +-
 .../Matrix/element_wise_all_sizes_impl.hpp    | 65 +++++++++++++------
 5 files changed, 48 insertions(+), 42 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
index fbc965df97a46..4f71059c759b7 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
deleted file mode 100644
index 5bae6a3184808..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//==----------- element_wise_all_sizes.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-
-constexpr size_t TN = 8;
-
-#include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
index 87adf891cd16b..3de741d8be76f 100644
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
@@ -15,7 +15,4 @@
 // RUN: %{run} %t.out
 
 #include "../common.hpp"
-
-constexpr size_t TN = 8;
-
 #include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index 1c07e494fcc47..ef13dcd6c640c 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
index 5800ab9c62745..e324f1fcd30af 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 static constexpr size_t M_MULTIPLIER = 16;
+template <typename T, size_t TileM, size_t TileN, size_t TileK> class add;
 
 template <typename T, size_t M, size_t N>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read_write> C,
@@ -23,7 +24,7 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read_write> C,
     }
 }
 
-template <typename T, typename T1, size_t TM, size_t TK, typename kernel_name>
+template <typename T, typename T1, size_t TM, size_t TN, size_t TK, typename kernel_name>
 void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   static constexpr size_t M = TM * M_MULTIPLIER;
   static constexpr size_t K = 128;
@@ -69,35 +70,61 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   assert_ops_ref<T, M, K>(bufA.get_host_access(), result);
 }
 
-template <typename Ta, size_t tM, size_t tK, typename kernel_name>
+template <typename Ta, size_t tM, size_t tN, size_t tK, typename kernel_name>
 void add_ref() {
   if constexpr (std::is_same_v<Ta, bfloat16>) {
     // Tests whether 5 + 2 = 7 operation is successful.
-    matrix_verify_add<bfloat16, bfloat16, tM, tK, kernel_name>(
+    matrix_verify_add<bfloat16, bfloat16, tM, tN, tK, kernel_name>(
         bfloat16(5.0), bfloat16(2.0), bfloat16(7.0));
   }
   if constexpr (std::is_same_v<Ta, int8_t>) {
-    matrix_verify_add<int8_t, int, tM, tK, kernel_name>(5 /*val1*/, 2 /*val2*/,
+    matrix_verify_add<int8_t, int, tM, tN, tK, kernel_name>(5 /*val1*/, 2 /*val2*/,
                                                         7 /*result*/);
   }
 }
 
+template <size_t TN>
+void test() {
+  add_ref<bfloat16, 1, TN, 16, add<bfloat16, 1, TN, 16>>();
+  add_ref<bfloat16, 2, TN, 16, add<bfloat16, 2, TN, 16>>();
+  add_ref<bfloat16, 3, TN, 16, add<bfloat16, 3, TN, 16>>();
+  add_ref<bfloat16, 4, TN, 16, add<bfloat16, 4, TN, 16>>();
+  add_ref<bfloat16, 5, TN, 16, add<bfloat16, 5, TN, 16>>();
+  add_ref<bfloat16, 6, TN, 16, add<bfloat16, 6, TN, 16>>();
+  add_ref<bfloat16, 7, TN, 16, add<bfloat16, 7, TN, 16>>();
+
+  add_ref<int8_t, 1, TN, 32, add<int8_t, 1, TN, 32>>();
+  add_ref<int8_t, 2, TN, 32, add<int8_t, 2, TN, 32>>();
+  add_ref<int8_t, 3, TN, 32, add<int8_t, 3, TN, 32>>();
+  add_ref<int8_t, 4, TN, 32, add<int8_t, 4, TN, 32>>();
+  add_ref<int8_t, 5, TN, 32, add<int8_t, 5, TN, 32>>();
+  add_ref<int8_t, 6, TN, 32, add<int8_t, 6, TN, 32>>();
+  add_ref<int8_t, 7, TN, 32, add<int8_t, 7, TN, 32>>();
+}
+
 int main() {
-  add_ref<bfloat16, 1 /*TM*/, 16 /*TK*/, class test_bfloat16_1>();
-  add_ref<bfloat16, 2 /*TM*/, 16 /*TK*/, class test_bfloat16_2>();
-  add_ref<bfloat16, 3 /*TM*/, 16 /*TK*/, class test_bfloat16_3>();
-  add_ref<bfloat16, 4 /*TM*/, 16 /*TK*/, class test_bfloat16_4>();
-  add_ref<bfloat16, 5 /*TM*/, 16 /*TK*/, class test_bfloat16_5>();
-  add_ref<bfloat16, 6 /*TM*/, 16 /*TK*/, class test_bfloat16_6>();
-  add_ref<bfloat16, 7 /*TM*/, 16 /*TK*/, class test_bfloat16_7>();
-
-  add_ref<int8_t, 1 /*TM*/, 32 /*TK*/, class test_int8_1>();
-  add_ref<int8_t, 2 /*TM*/, 32 /*TK*/, class test_int8_2>();
-  add_ref<int8_t, 3 /*TM*/, 32 /*TK*/, class test_int8_3>();
-  add_ref<int8_t, 4 /*TM*/, 32 /*TK*/, class test_int8_4>();
-  add_ref<int8_t, 5 /*TM*/, 32 /*TK*/, class test_int8_5>();
-  add_ref<int8_t, 6 /*TM*/, 32 /*TK*/, class test_int8_6>();
-  add_ref<int8_t, 7 /*TM*/, 32 /*TK*/, class test_int8_7>();
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<8>();
+      break;
+    }
+  }
 
   std::cout << "Passed\n";
 }

From 7d51e9cd79d6c0c9a8d6a2eca0fd93ebafdd90b1 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 22 May 2024 10:28:44 -0700
Subject: [PATCH 31/42] clang-format and nits

---
 .../Matrix/SG32/element_wise_all_ops_half.cpp  |  3 +--
 .../SG32/element_wise_all_ops_int8_packed.cpp  |  2 +-
 sycl/test-e2e/Matrix/element_wise_abc_impl.hpp |  3 +--
 .../Matrix/element_wise_all_ops_half.cpp       |  3 +--
 .../Matrix/element_wise_all_ops_half_impl.hpp  | 18 ++++++------------
 .../Matrix/element_wise_all_ops_int8.cpp       |  2 +-
 .../element_wise_all_ops_int8_packed.cpp       |  2 +-
 .../Matrix/element_wise_all_sizes_impl.hpp     | 10 +++++-----
 8 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
index b04fdff3c0819..99473c896628b 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-fp16
-// REQUIRES: matrix,gpu
-// REQUIRES: matrix-fp16
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
index 2d62023ad7d01..5f9f2809bf3ff 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
index e61f747f75cc0..5caf6d3e0a3e7 100644
--- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
@@ -13,8 +13,7 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-template <size_t M, size_t N, size_t K, int vnniFactor>
-class add;
+template <size_t M, size_t N, size_t K, int vnniFactor> class add;
 
 template <typename T1, typename T2, size_t M, size_t N, size_t K,
           int vnniFactor>
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
index e60a6e720cf03..f97241f275bd1 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-fp16
-// REQUIRES: matrix
-// REQUIRES: matrix-fp16
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
index 34c79256b4813..04e3d516a491a 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
@@ -5,16 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-template <size_t TileM, size_t TileN, size_t TileK>
-class add;
-template <size_t TileM, size_t TileN, size_t TileK>
-class sub;
-template <size_t TileM, size_t TileN, size_t TileK>
-class mult;
-template <size_t TileM, size_t TileN, size_t TileK>
-class divide;
-template <size_t TileM, size_t TileN, size_t TileK>
-class logic;
+template <size_t TileM, size_t TileN, size_t TileK> class add;
+template <size_t TileM, size_t TileN, size_t TileK> class sub;
+template <size_t TileM, size_t TileN, size_t TileK> class mult;
+template <size_t TileM, size_t TileN, size_t TileK> class divide;
+template <size_t TileM, size_t TileN, size_t TileK> class logic;
 
 template <typename T, size_t M, size_t N, typename R>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> C,
@@ -87,8 +82,7 @@ void test() {
         if (x) {
           if (x > static_cast<Ta>(2.0) || x >= static_cast<Ta>(2.0) ||
               x < static_cast<Ta>(2.0) || x <= static_cast<Ta>(2.0)) {
-            Ta val =
-                (x != static_cast<Ta>(2.0)) ? x : static_cast<Ta>(2.0);
+            Ta val = (x != static_cast<Ta>(2.0)) ? x : static_cast<Ta>(2.0);
             val--;
             val++;
             if (x == static_cast<Ta>(2.0)) {
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
index 11b488a8298ca..ba538e4ebffef 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
index 43370673c75f7..be27718279b79 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
index e324f1fcd30af..5228a154e9f6f 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp
@@ -24,7 +24,8 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read_write> C,
     }
 }
 
-template <typename T, typename T1, size_t TM, size_t TN, size_t TK, typename kernel_name>
+template <typename T, typename T1, size_t TM, size_t TN, size_t TK,
+          typename kernel_name>
 void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) {
   static constexpr size_t M = TM * M_MULTIPLIER;
   static constexpr size_t K = 128;
@@ -78,13 +79,12 @@ void add_ref() {
         bfloat16(5.0), bfloat16(2.0), bfloat16(7.0));
   }
   if constexpr (std::is_same_v<Ta, int8_t>) {
-    matrix_verify_add<int8_t, int, tM, tN, tK, kernel_name>(5 /*val1*/, 2 /*val2*/,
-                                                        7 /*result*/);
+    matrix_verify_add<int8_t, int, tM, tN, tK, kernel_name>(
+        5 /*val1*/, 2 /*val2*/, 7 /*result*/);
   }
 }
 
-template <size_t TN>
-void test() {
+template <size_t TN> void test() {
   add_ref<bfloat16, 1, TN, 16, add<bfloat16, 1, TN, 16>>();
   add_ref<bfloat16, 2, TN, 16, add<bfloat16, 2, TN, 16>>();
   add_ref<bfloat16, 3, TN, 16, add<bfloat16, 3, TN, 16>>();

From df1be4f5b874260a9557b4a4890632fac27cae16 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Wed, 22 May 2024 14:46:02 -0700
Subject: [PATCH 32/42] fixed requires

---
 sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp                | 2 ++
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp       | 4 +++-
 sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp       | 2 ++
 .../test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp | 2 ++
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
index 8b4cd57e4b477..1fdd989ae091f 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
@@ -5,6 +5,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
index 99473c896628b..847f4a7812aa2 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
@@ -5,8 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 // REQUIRES: aspect-fp16
-// REQUIRES: aspect-ext_intel_matrix
+// REQUIRES: aspect-ext_intel_matrix, gpu
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
index 4f71059c759b7..984ff9a9b082f 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
@@ -5,6 +5,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
index 5f9f2809bf3ff..af2f4df82b648 100644
--- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
+++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp
@@ -5,6 +5,8 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
 // REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 

From ced84eb054a30c49ff4bf1b43fffc782b4911059 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Thu, 23 May 2024 12:54:44 -0700
Subject: [PATCH 33/42] fixed element_wise_all_ops_int8

---
 .../Matrix/element_wise_all_ops_int8_impl.hpp | 109 +++++++++---------
 1 file changed, 55 insertions(+), 54 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
index b60d24f00f769..8c39c2f132d0a 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
@@ -6,33 +6,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-template <size_t TileM, size_t TileN, size_t TileK> class add;
-template <size_t TileM, size_t TileN, size_t TileK> class sub;
-template <size_t TileM, size_t TileN, size_t TileK> class mul;
-template <size_t TileM, size_t TileN, size_t TileK> class divide;
-template <size_t TileM, size_t TileN, size_t TileK> class logic;
+template <size_t TileRows, size_t TileCols> class add;
+template <size_t TileRows, size_t TileCols> class sub;
+template <size_t TileRows, size_t TileCols> class mul;
+template <size_t TileRows, size_t TileCols> class divide;
+template <size_t TileRows, size_t TileCols> class logic;
 
-template <typename T, size_t M, size_t N, typename R>
+template <typename T, size_t Rows, size_t Cols, typename R>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
-  for (size_t i = 0; i < M; i++)
-    for (size_t j = 0; j < N; j++) {
+  for (size_t i = 0; i < Rows; i++)
+    for (size_t j = 0; j < Cols; j++) {
       auto diff = C[i][j] - ref;
       assert(std::fabs(static_cast<R>(diff)) <=
              std::numeric_limits<R>::epsilon());
     }
 }
 
-template <typename T, size_t M, size_t N, size_t TileM, size_t TileN,
-          size_t TileK, class kernel_name, typename R, typename OP>
-void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
-  buffer<int8_t, 2> bufA(A.get_data(), range<2>(M, N));
+template <typename T, size_t Rows, size_t Cols, size_t TileRows,
+          size_t TileCols, class kernel_name, typename R, typename OP>
+void matrix_verify_op(big_matrix<T, Rows, Cols> &A, const R ref, OP op) {
+  buffer<T, 2> bufA(A.get_data(), range<2>(Rows, Cols));
 
   queue q;
   size_t sg_size = get_sg_size<kernel_name>(q);
-  nd_range<2> r({M / TileM, N / TileN * sg_size}, {1, 1 * sg_size});
+  nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size});
 
   q.submit([&](handler &cgh) {
-     auto accA = bufA.get_access<access::mode::read_write>(cgh);
+    sycl::accessor accA{bufA, cgh, sycl::read_write};
 
      cgh.parallel_for<kernel_name>(
          r, [=](nd_item<2> spmd_item)
@@ -46,7 +46,8 @@ void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
            sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, T, use::a, TileM, TileK, layout::row_major>
+           joint_matrix<sub_group, T, use::a, TileRows, TileCols,
+                        layout::row_major>
                sub_a;
 
            joint_matrix_fill(sg, sub_a, 5);
@@ -55,47 +56,47 @@ void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TileM) * N + sg_starty / sg_size * TileN,
-               N);
+                   (sg_startx * TileRows) * Cols +
+                   sg_starty / sg_size * TileCols,
+               Cols);
          }); // parallel for
    }).wait();
-  assert_ops_ref<T, M, N, R>(bufA.get_host_access(read_only), ref);
+  assert_ops_ref<T, Rows, Cols, R>(bufA.get_host_access(read_only), ref);
 }
 
-template <size_t TM, size_t TN, size_t TK> void test() {
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  int8_t A[MATRIX_M][MATRIX_N];
-
-  big_matrix<int8_t, MATRIX_M, MATRIX_N> MA((int8_t *)&A);
-
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, add<TM, TN, TK>,
-                   int>(MA, 7, [=](auto &x) { x = x + 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, sub<TM, TN, TK>,
-                   int>(MA, 3, [=](auto &x) { x = x - 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, mul<TM, TN, TK>,
-                   int>(MA, 10, [=](auto &x) { x = x * 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, divide<TM, TN, TK>,
-                   int>(MA, 2,
-                        [=](auto &x) { x = x / 2; }); // truncation is expected
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, logic<TM, TN, TK>,
-                   int>(MA, 7, [=](auto &x) {
-    if (x) {
-      if (x > 2 || x >= 2 || x < 2 || x <= 2) {
-        int8_t val = (x != 2) ? x : 2;
-        val--;
-        val++;
-        if (x == 2) {
-          val -= 2;
-          val *= 3;
-          val /= 2;
-        } else {
-          val += 2;
+template <typename Ta, typename TResult, size_t TM, size_t TK> void test() {
+  static constexpr size_t Rows = TM * 2;
+  static constexpr size_t Cols = TK * 2;
+  Ta A[Rows][Cols];
+
+  big_matrix<Ta, Rows, Cols> MA((Ta *)&A);
+
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, add<TM, TK>, TResult>(
+      MA, 7, [=](auto &x) { x = x + 2; });
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, sub<TM, TK>, TResult>(
+      MA, 3, [=](auto &x) { x = x - 2; });
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, mul<TM, TK>, TResult>(
+      MA, 10, [=](auto &x) { x = x * 2; });
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, divide<TM, TK>, TResult>(
+      MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, logic<TM, TK>, TResult>(
+      MA, 7, [=](auto &x) {
+        if (x) {
+          if (x > 2 || x >= 2 || x < 2 || x <= 2) {
+            Ta val = (x != 2) ? x : 2;
+            val--;
+            val++;
+            if (x == 2) {
+              val -= 2;
+              val *= 3;
+              val /= 2;
+            } else {
+              val += 2;
+            }
+            x = val;
+          }
         }
-        x = val;
-      }
-    }
-  });
+      });
 }
 
 int main() {
@@ -107,17 +108,17 @@ int main() {
 
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) { // Intel AMX
-      test<16, 16, 64>();
+      test<int8_t, int, /*TM*/ 16, /*TK*/ 64>();
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<8, 16, 32>();
+      test<int8_t, int, /*TM*/ 8, /*TK*/ 32>();
       break;
     }
 
     if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<8, 8, 32>();
+      test<int8_t, int, /*TM*/ 8, /*TK*/ 32>();
       break;
     }
   }

From 8b0e59debf7835de7ed26ae06419edd90a57f938 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Mon, 27 May 2024 10:34:42 -0700
Subject: [PATCH 34/42] CPU works element_wise_all_ops_half

---
 .../Matrix/element_wise_all_ops_half.cpp      |  1 +
 .../Matrix/element_wise_all_ops_half_impl.hpp | 94 +++++++++----------
 2 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
index f97241f275bd1..148f54e44bedc 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// REQUIRES: matrix-fp16
 // REQUIRES: aspect-fp16
 // REQUIRES: aspect-ext_intel_matrix
 
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
index 04e3d516a491a..0cfedc62ff425 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
@@ -5,34 +5,33 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-template <size_t TileM, size_t TileN, size_t TileK> class add;
-template <size_t TileM, size_t TileN, size_t TileK> class sub;
-template <size_t TileM, size_t TileN, size_t TileK> class mult;
-template <size_t TileM, size_t TileN, size_t TileK> class divide;
-template <size_t TileM, size_t TileN, size_t TileK> class logic;
-
-template <typename T, size_t M, size_t N, typename R>
-void assert_ops_ref(host_accessor<T, 2, access::mode::read> C,
-                    const float ref) {
-  for (size_t i = 0; i < M; i++)
-    for (size_t j = 0; j < N; j++) {
+template <size_t TileRows, size_t TileCols> class add;
+template <size_t TileRows, size_t TileCols> class sub;
+template <size_t TileRows, size_t TileCols> class mul;
+template <size_t TileRows, size_t TileCols> class divide;
+template <size_t TileRows, size_t TileCols> class logic;
+
+template <typename T, size_t Rows, size_t Cols, typename R>
+void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
+  for (size_t i = 0; i < Rows; i++)
+    for (size_t j = 0; j < Cols; j++) {
       auto diff = C[i][j] - ref;
-      assert(std::fabs(static_cast<R>(diff)) <
+      assert(std::fabs(static_cast<R>(diff)) <=
              std::numeric_limits<R>::epsilon());
     }
 }
 
-template <typename T, size_t M, size_t N, size_t TileM, size_t TileN,
-          size_t TileK, class kernel_name, typename R, typename OP>
-void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
-  buffer<half, 2> bufA(A.get_data(), range<2>(M, N));
+template <typename T, size_t Rows, size_t Cols, size_t TileRows,
+          size_t TileCols, class kernel_name, typename R, typename OP>
+void matrix_verify_op(big_matrix<T, Rows, Cols> &A, const R ref, OP op) {
+  buffer<T, 2> bufA(A.get_data(), range<2>(Rows, Cols));
 
   queue q;
   size_t sg_size = get_sg_size<kernel_name>(q);
-  nd_range<2> r({M / TileM, N / TileN * sg_size}, {1, 1 * sg_size});
+  nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size});
 
   q.submit([&](handler &cgh) {
-     auto accA = bufA.get_access<access::mode::read_write>(cgh);
+     sycl::accessor accA{bufA, cgh, sycl::read_write};
 
      cgh.parallel_for<kernel_name>(
          r, [=](nd_item<2> spmd_item)
@@ -46,7 +45,8 @@ void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
            sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, T, use::a, TileM, TileK, layout::row_major>
+           joint_matrix<sub_group, T, use::a, TileRows, TileCols,
+                        layout::row_major>
                sub_a;
 
            joint_matrix_fill(sg, sub_a, 5);
@@ -55,37 +55,37 @@ void matrix_verify_op(big_matrix<T, M, N> &A, const R ref, OP op) {
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_a,
                accA.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TileM) * N + sg_starty / sg_size * TileN,
-               N);
+                   (sg_startx * TileRows) * Cols +
+                   sg_starty / sg_size * TileCols,
+               Cols);
          }); // parallel for
    }).wait();
-  assert_ops_ref<T, M, N, R>(bufA.get_host_access(read_only), ref);
+  assert_ops_ref<T, Rows, Cols, R>(bufA.get_host_access(read_only), ref);
 }
 
-template <typename Ta, typename Tc, size_t TM, size_t TN, size_t TK>
-void test() {
-  constexpr size_t MATRIX_M = TM * 2;
-  constexpr size_t MATRIX_N = TN * 2;
-  Ta A[MATRIX_M][MATRIX_N];
-  big_matrix<Ta, MATRIX_M, MATRIX_N> MA((Ta *)&A);
-
-  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, add<TM, TN, TK>, Tc>(
-      MA, 7.0, [=](auto &x) { x = x + static_cast<Ta>(2); });
-  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, sub<TM, TN, TK>, Tc>(
-      MA, 3.0, [=](auto &x) { x = x - static_cast<Ta>(2); });
-  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, mult<TM, TN, TK>, Tc>(
-      MA, 15.0, [=](auto &x) { x = x * static_cast<Ta>(3.0); });
-  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, divide<TM, TN, TK>, Tc>(
-      MA, 2.5, [=](auto &x) { x = x / static_cast<Ta>(2.0); });
-  matrix_verify_op<Ta, MATRIX_M, MATRIX_N, TM, TN, TK, logic<TM, TN, TK>, Tc>(
-      MA, 7.0, [=](auto &x) {
+template <typename Ta, typename TResult, size_t TM, size_t TK> void test() {
+  static constexpr size_t Rows = TM * 2;
+  static constexpr size_t Cols = TK * 2;
+  Ta A[Rows][Cols];
+
+  big_matrix<Ta, Rows, Cols> MA((Ta *)&A);
+
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, add<TM, TK>, TResult>(
+      MA, 7, [=](auto &x) { x = x + 2; });
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, sub<TM, TK>, TResult>(
+      MA, 3, [=](auto &x) { x = x - 2; });
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, mul<TM, TK>, TResult>(
+      MA, 10, [=](auto &x) { x = x * 2; });
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, divide<TM, TK>, TResult>(
+      MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
+  matrix_verify_op<Ta, Rows, Cols, TM, TK, logic<TM, TK>, TResult>(
+      MA, 7, [=](auto &x) {
         if (x) {
-          if (x > static_cast<Ta>(2.0) || x >= static_cast<Ta>(2.0) ||
-              x < static_cast<Ta>(2.0) || x <= static_cast<Ta>(2.0)) {
-            Ta val = (x != static_cast<Ta>(2.0)) ? x : static_cast<Ta>(2.0);
+          if (x > 2 || x >= 2 || x < 2 || x <= 2) {
+            Ta val = (x != 2) ? x : 2;
             val--;
             val++;
-            if (x == static_cast<Ta>(2.0)) {
+            if (x == 2) {
               val -= 2;
               val *= 3;
               val /= 2;
@@ -107,20 +107,20 @@ int main() {
 
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) { // Intel AMX
-      test<half, float, 16, 16, 32>();
+      test<half, float, /*TM*/ 4, /*TK*/ 4>();
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<half, float, 8, 16, 16>();
+      test<half, float, /*TM*/ 8, /*TK*/ 32>();
       break;
     }
 
     if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<half, float, 8, 8, 16>();
+      test<half, float, /*TM*/ 8, /*TK*/ 32>();
       break;
     }
   }
 
   return 0;
-}
+}
\ No newline at end of file

From 925c241fbb2c309c500a2a3b40b1e0710a8aca00 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Mon, 27 May 2024 13:54:55 -0700
Subject: [PATCH 35/42] CPU pass element_wise_all_ops_int8_packed

---
 .../element_wise_all_ops_int8_packed_impl.hpp | 115 +++++++++---------
 1 file changed, 58 insertions(+), 57 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
index a3ed8d73bace2..c4a058db808de 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
@@ -6,33 +6,34 @@
 //
 //===----------------------------------------------------------------------===//
 
-template <size_t TileM, size_t TileN, size_t TileK> class add;
-template <size_t TileM, size_t TileN, size_t TileK> class sub;
-template <size_t TileM, size_t TileN, size_t TileK> class mul;
-template <size_t TileM, size_t TileN, size_t TileK> class divide;
-template <size_t TileM, size_t TileN, size_t TileK> class logic;
+template <size_t TileRows, size_t TileCols> class add;
+template <size_t TileRows, size_t TileCols> class sub;
+template <size_t TileRows, size_t TileCols> class mul;
+template <size_t TileRows, size_t TileCols> class divide;
+template <size_t TileRows, size_t TileCols> class logic;
 
-template <typename T, size_t M, size_t N, typename R>
+template <typename T, size_t Rows, size_t Cols, typename R>
 void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
-  for (size_t i = 0; i < M; i++)
-    for (size_t j = 0; j < N; j++) {
-      auto diff = C[i][j] - ref;
+  for (size_t i = 0; i < Rows; i++)
+    for (size_t j = 0; j < Cols; j++) {
+      R diff = C[i][j] - ref;
       assert(std::fabs(static_cast<R>(diff)) <=
              std::numeric_limits<R>::epsilon());
     }
 }
 
-template <typename T, size_t M, size_t N, size_t TileM, size_t TileN,
-          size_t TileK, class kernel_name, typename R, typename OP>
-void matrix_verify_op(big_matrix<T, M, N> &B, const R ref, OP op) {
-  buffer<int8_t, 2> bufB(B.get_data(), range<2>(M, N));
+template <typename T, size_t Rows, size_t Cols, size_t TileRows,
+          size_t TileCols, size_t VNNI, class kernel_name, typename R,
+          typename OP>
+void matrix_verify_op(big_matrix<T, Rows, Cols> &B, const R ref, OP op) {
+  buffer<T, 2> bufB(B.get_data(), range<2>(Rows, Cols));
 
   queue q;
   size_t sg_size = get_sg_size<kernel_name>(q);
-  nd_range<2> r({M / TileM, N / TileN * sg_size}, {1, 1 * sg_size});
+  nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size});
 
   q.submit([&](handler &cgh) {
-     auto accB = bufB.get_access<access::mode::read_write>(cgh);
+     sycl::accessor accB{bufB, cgh, sycl::read_write};
 
      cgh.parallel_for<kernel_name>(
          r, [=](nd_item<2> spmd_item)
@@ -46,7 +47,7 @@ void matrix_verify_op(big_matrix<T, M, N> &B, const R ref, OP op) {
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
            sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, T, use::b, TileK, TileN,
+           joint_matrix<sub_group, T, use::b, TileRows, TileCols,
                         layout::ext_intel_packed>
                sub_b;
 
@@ -56,48 +57,48 @@ void matrix_verify_op(big_matrix<T, M, N> &B, const R ref, OP op) {
            ext::intel::experimental::matrix::joint_matrix_store(
                sg, sub_b,
                accB.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * TileM) * N * 4 +
-                   sg_starty / sg_size * TileN * 4,
-               N * 4);
+                   (sg_startx * TileRows / VNNI) * Cols * VNNI +
+                   sg_starty / sg_size * TileCols * VNNI,
+               Cols * VNNI);
          }); // parallel for
    }).wait();
-  assert_ops_ref<T, M, N, R>(bufB.get_host_access(read_only), ref);
+  assert_ops_ref<T, Rows, Cols, R>(bufB.get_host_access(read_only), ref);
 }
 
-template <size_t TM, size_t TN, size_t TK> void test() {
-  static constexpr size_t MATRIX_M = TM * 2;
-  static constexpr size_t MATRIX_N = TN * 2;
-  int8_t B[MATRIX_M][MATRIX_N];
-
-  big_matrix<int8_t, MATRIX_M, MATRIX_N> MB((int8_t *)&B);
-
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, add<TM, TN, TK>,
-                   int>(MB, 7, [=](auto &x) { x = x + 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, sub<TM, TN, TK>,
-                   int>(MB, 3, [=](auto &x) { x = x - 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, mul<TM, TN, TK>,
-                   int>(MB, 10, [=](auto &x) { x = x * 2; });
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, divide<TM, TN, TK>,
-                   int>(MB, 2,
-                        [=](auto &x) { x = x / 2; }); // truncation is expected
-  matrix_verify_op<int8_t, MATRIX_M, MATRIX_N, TM, TN, TK, logic<TM, TN, TK>,
-                   int>(MB, 7, [=](auto &x) {
-    if (x) {
-      if (x > 2 || x >= 2 || x < 2 || x <= 2) {
-        int8_t val = (x != 2) ? x : 2;
-        val--;
-        val++;
-        if (x == 2) {
-          val -= 2;
-          val *= 3;
-          val /= 2;
-        } else {
-          val += 2;
+template <typename Ta, typename TResult, size_t TK, size_t TN, size_t VNNI>
+void test() {
+  static constexpr size_t Rows = TK * 2;
+  static constexpr size_t Cols = TN * 2;
+  Ta B[Rows][Cols];
+
+  big_matrix<Ta, Rows, Cols> MB((Ta *)&B);
+
+  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, add<TK, TN>, TResult>(
+      MB, 7, [=](auto &x) { x = x + 2; });
+  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, sub<TK, TN>, TResult>(
+      MB, 3, [=](auto &x) { x = x - 2; });
+  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, mul<TK, TN>, TResult>(
+      MB, 10, [=](auto &x) { x = x * 2; });
+  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, divide<TK, TN>, TResult>(
+      MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
+  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, logic<TK, TN>, TResult>(
+      MB, 7, [=](auto &x) {
+        if (x) {
+          if (x > 2 || x >= 2 || x < 2 || x <= 2) {
+            Ta val = (x != 2) ? x : 2;
+            val--;
+            val++;
+            if (x == 2) {
+              val -= 2;
+              val *= 3;
+              val /= 2;
+            } else {
+              val += 2;
+            }
+            x = val;
+          }
         }
-        x = val;
-      }
-    }
-  });
+      });
 }
 
 int main() {
@@ -108,18 +109,18 @@ int main() {
                         matrix_combinations>();
 
   for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) { // Intel AMX
-      test<16, 16, 64>();
+    if (combinations[i].nsize == 0) {                        // Intel AMX
+      test<int8_t, int, /*TK*/ 64, /*TN*/ 16, /*VNNI*/ 4>(); // should work
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<8, 16, 32>();
+      test<int8_t, int, /*TK*/ 32, /*TN*/ 16, /*VNNI*/ 4>();
       break;
     }
 
     if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<8, 8, 32>();
+      test<int8_t, int, /*TK*/ 32, /*TN*/ 8, /*VNNI*/ 4>();
       break;
     }
   }

From 894d5f8a1e8bead6b862078a3b5afa455a3aafa3 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Mon, 27 May 2024 13:58:53 -0700
Subject: [PATCH 36/42] removed XMX8/element_wise_all_sizes_no_split

---
 .../XMX8/element_wise_all_sizes_no_split.cpp   | 18 ------------------
 .../test-e2e/Matrix/element_wise_all_sizes.cpp |  3 +++
 2 files changed, 3 insertions(+), 18 deletions(-)
 delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp

diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
deleted file mode 100644
index 3de741d8be76f..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//==-------- element_wise_all_sizes_no_split.cpp  - DPC++ joint_matrix------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// This is a version of element_wise_all_sizes test with disabled device code
-// split to test against fixed bug in IGC
-
-// REQUIRES: matrix-xmx8
-// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
-
-// RUN: %{build} -fsycl-device-code-split=off -o %t.out
-// RUN: %{run} %t.out
-
-#include "../common.hpp"
-#include "../element_wise_all_sizes_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index ef13dcd6c640c..7999904ba7659 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -11,5 +11,8 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+// RUN: %{build} -fsycl-device-code-split=off -o %t_split.out
+// RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %}
+
 #include "common.hpp"
 #include "element_wise_all_sizes_impl.hpp"

From 0a6a182bd67703827fd413243a16bcf47aa18c1b Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Mon, 27 May 2024 14:40:01 -0700
Subject: [PATCH 37/42] clang-format

---
 sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
index 8c39c2f132d0a..8a2f1f495e41d 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp
@@ -32,7 +32,7 @@ void matrix_verify_op(big_matrix<T, Rows, Cols> &A, const R ref, OP op) {
   nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size});
 
   q.submit([&](handler &cgh) {
-    sycl::accessor accA{bufA, cgh, sycl::read_write};
+     sycl::accessor accA{bufA, cgh, sycl::read_write};
 
      cgh.parallel_for<kernel_name>(
          r, [=](nd_item<2> spmd_item)

From 72f021e36063a0933969ea7eef6ab8175f40caa0 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 28 May 2024 08:03:19 -0700
Subject: [PATCH 38/42] Typo in element_wise_all_ops_half

---
 .../Matrix/element_wise_all_ops_half_impl.hpp | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
index 0cfedc62ff425..407433c0d3032 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
@@ -71,26 +71,27 @@ template <typename Ta, typename TResult, size_t TM, size_t TK> void test() {
   big_matrix<Ta, Rows, Cols> MA((Ta *)&A);
 
   matrix_verify_op<Ta, Rows, Cols, TM, TK, add<TM, TK>, TResult>(
-      MA, 7, [=](auto &x) { x = x + 2; });
+      MA, 7, [=](Ta &x) { x = x + static_cast<Ta>(2); });
   matrix_verify_op<Ta, Rows, Cols, TM, TK, sub<TM, TK>, TResult>(
-      MA, 3, [=](auto &x) { x = x - 2; });
+      MA, 3, [=](Ta &x) { x = x - static_cast<Ta>(2); });
   matrix_verify_op<Ta, Rows, Cols, TM, TK, mul<TM, TK>, TResult>(
-      MA, 10, [=](auto &x) { x = x * 2; });
+      MA, 10, [=](Ta &x) { x = x * static_cast<Ta>(2); });
   matrix_verify_op<Ta, Rows, Cols, TM, TK, divide<TM, TK>, TResult>(
-      MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
+      MA, 2.5, [=](Ta &x) { x = x / static_cast<Ta>(2); });
   matrix_verify_op<Ta, Rows, Cols, TM, TK, logic<TM, TK>, TResult>(
-      MA, 7, [=](auto &x) {
+      MA, 7, [=](Ta &x) {
         if (x) {
-          if (x > 2 || x >= 2 || x < 2 || x <= 2) {
-            Ta val = (x != 2) ? x : 2;
+          if (x > static_cast<Ta>(2) || x >= static_cast<Ta>(2) ||
+              x < static_cast<Ta>(2) || x <= static_cast<Ta>(2)) {
+            Ta val = (x != static_cast<Ta>(2)) ? x : static_cast<Ta>(2);
             val--;
             val++;
-            if (x == 2) {
-              val -= 2;
-              val *= 3;
-              val /= 2;
+            if (x == static_cast<Ta>(2)) {
+              val -= static_cast<Ta>(2);
+              val *= static_cast<Ta>(3);
+              val /= static_cast<Ta>(2);
             } else {
-              val += 2;
+              val += static_cast<Ta>(2);
             }
             x = val;
           }
@@ -107,17 +108,17 @@ int main() {
 
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) { // Intel AMX
-      test<half, float, /*TM*/ 4, /*TK*/ 4>();
+      test<half, float, /*TM*/ 16, /*TK*/ 32>();
       break;
     }
 
     if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
-      test<half, float, /*TM*/ 8, /*TK*/ 32>();
+      test<half, float, /*TM*/ 8, /*TK*/ 16>();
       break;
     }
 
     if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
-      test<half, float, /*TM*/ 8, /*TK*/ 32>();
+      test<half, float, /*TM*/ 8, /*TK*/ 16>();
       break;
     }
   }

From 587137c67297ff3c3d065fbc7d5069c20afac770 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 28 May 2024 14:51:42 -0700
Subject: [PATCH 39/42] nits

---
 .../Matrix/element_wise_all_ops_half.cpp      |  2 +-
 .../Matrix/element_wise_all_ops_half_impl.hpp |  4 +-
 .../element_wise_all_ops_int8_packed_impl.hpp | 37 ++++++++++---------
 .../Matrix/element_wise_all_sizes.cpp         |  1 +
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
index 148f54e44bedc..bb651568f9251 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
@@ -5,9 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-fp16
 // REQUIRES: aspect-fp16
 // REQUIRES: aspect-ext_intel_matrix
+// REQUIRES: matrix-fp16
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
index 407433c0d3032..4065c7a78a566 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp
@@ -16,7 +16,7 @@ void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
   for (size_t i = 0; i < Rows; i++)
     for (size_t j = 0; j < Cols; j++) {
       auto diff = C[i][j] - ref;
-      assert(std::fabs(static_cast<R>(diff)) <=
+      assert(std::fabs(static_cast<R>(diff)) <
              std::numeric_limits<R>::epsilon());
     }
 }
@@ -124,4 +124,4 @@ int main() {
   }
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
index c4a058db808de..c6683f9657c4a 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
@@ -12,20 +12,21 @@ template <size_t TileRows, size_t TileCols> class mul;
 template <size_t TileRows, size_t TileCols> class divide;
 template <size_t TileRows, size_t TileCols> class logic;
 
-template <typename T, size_t Rows, size_t Cols, typename R>
-void assert_ops_ref(host_accessor<T, 2, access::mode::read> C, const R ref) {
+template <typename T, size_t Rows, size_t Cols, typename TResult>
+void assert_ops_ref(host_accessor<T, 2, access::mode::read> C,
+                    const TResult ref) {
   for (size_t i = 0; i < Rows; i++)
     for (size_t j = 0; j < Cols; j++) {
-      R diff = C[i][j] - ref;
-      assert(std::fabs(static_cast<R>(diff)) <=
-             std::numeric_limits<R>::epsilon());
+      TResult diff = C[i][j] - ref;
+      assert(std::fabs(static_cast<TResult>(diff)) <=
+             std::numeric_limits<TResult>::epsilon());
     }
 }
 
 template <typename T, size_t Rows, size_t Cols, size_t TileRows,
-          size_t TileCols, size_t VNNI, class kernel_name, typename R,
+          size_t TileCols, size_t VNNI, class kernel_name, typename TResult,
           typename OP>
-void matrix_verify_op(big_matrix<T, Rows, Cols> &B, const R ref, OP op) {
+void matrix_verify_op(big_matrix<T, Rows, Cols> &B, const TResult ref, OP op) {
   buffer<T, 2> bufB(B.get_data(), range<2>(Rows, Cols));
 
   queue q;
@@ -62,30 +63,30 @@ void matrix_verify_op(big_matrix<T, Rows, Cols> &B, const R ref, OP op) {
                Cols * VNNI);
          }); // parallel for
    }).wait();
-  assert_ops_ref<T, Rows, Cols, R>(bufB.get_host_access(read_only), ref);
+  assert_ops_ref<T, Rows, Cols, TResult>(bufB.get_host_access(read_only), ref);
 }
 
-template <typename Ta, typename TResult, size_t TK, size_t TN, size_t VNNI>
+template <typename T, typename TResult, size_t TK, size_t TN, size_t VNNI>
 void test() {
   static constexpr size_t Rows = TK * 2;
   static constexpr size_t Cols = TN * 2;
-  Ta B[Rows][Cols];
+  T B[Rows][Cols];
 
-  big_matrix<Ta, Rows, Cols> MB((Ta *)&B);
+  big_matrix<T, Rows, Cols> MB((T *)&B);
 
-  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, add<TK, TN>, TResult>(
+  matrix_verify_op<T, Rows, Cols, TK, TN, VNNI, add<TK, TN>, TResult>(
       MB, 7, [=](auto &x) { x = x + 2; });
-  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, sub<TK, TN>, TResult>(
+  matrix_verify_op<T, Rows, Cols, TK, TN, VNNI, sub<TK, TN>, TResult>(
       MB, 3, [=](auto &x) { x = x - 2; });
-  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, mul<TK, TN>, TResult>(
+  matrix_verify_op<T, Rows, Cols, TK, TN, VNNI, mul<TK, TN>, TResult>(
       MB, 10, [=](auto &x) { x = x * 2; });
-  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, divide<TK, TN>, TResult>(
+  matrix_verify_op<T, Rows, Cols, TK, TN, VNNI, divide<TK, TN>, TResult>(
       MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected
-  matrix_verify_op<Ta, Rows, Cols, TK, TN, VNNI, logic<TK, TN>, TResult>(
+  matrix_verify_op<T, Rows, Cols, TK, TN, VNNI, logic<TK, TN>, TResult>(
       MB, 7, [=](auto &x) {
         if (x) {
           if (x > 2 || x >= 2 || x < 2 || x <= 2) {
-            Ta val = (x != 2) ? x : 2;
+            T val = (x != 2) ? x : 2;
             val--;
             val++;
             if (x == 2) {
@@ -110,7 +111,7 @@ int main() {
 
   for (unsigned int i = 0; i < combinations.size(); i++) {
     if (combinations[i].nsize == 0) {                        // Intel AMX
-      test<int8_t, int, /*TK*/ 64, /*TN*/ 16, /*VNNI*/ 4>(); // should work
+      test<int8_t, int, /*TK*/ 64, /*TN*/ 16, /*VNNI*/ 4>();
       break;
     }
 
diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index 7999904ba7659..5f743cfe73b3d 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -11,6 +11,7 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
+// This is a version of the test with disabled device code
 // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out
 // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %}
 

From 7cabdc732c7eb073e8a5e4fb36a73fa0d4a90299 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 28 May 2024 21:50:36 -0700
Subject: [PATCH 40/42] Update
 sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp

Co-authored-by: Yury Plyakhin <yury.plyakhin@intel.com>
---
 sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
index c6683f9657c4a..7336bb8467fa5 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
@@ -110,7 +110,7 @@ int main() {
                         matrix_combinations>();
 
   for (unsigned int i = 0; i < combinations.size(); i++) {
-    if (combinations[i].nsize == 0) {                        // Intel AMX
+    if (combinations[i].nsize == 0) { // Intel AMX
       test<int8_t, int, /*TK*/ 64, /*TN*/ 16, /*VNNI*/ 4>();
       break;
     }

From 0ed46c94abd52f9162a7b1dd41ba0c625e210529 Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 28 May 2024 21:51:20 -0700
Subject: [PATCH 41/42] Update sycl/test-e2e/Matrix/element_wise_all_sizes.cpp

Co-authored-by: Yury Plyakhin <yury.plyakhin@intel.com>
---
 sycl/test-e2e/Matrix/element_wise_all_sizes.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index 5f743cfe73b3d..e3da4f7d130eb 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -12,6 +12,8 @@
 // RUN: %{run} %t.out
 
 // This is a version of the test with disabled device code
+// This is a version of the test with disabled device code
+// split to test against fixed bug in IGC
 // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out
 // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %}
 

From f2d2d1baab311dadd34287242d9273030468c95b Mon Sep 17 00:00:00 2001
From: Artem Radzikhovskyy <artem.radzikhovskyy@intel.com>
Date: Tue, 28 May 2024 21:52:45 -0700
Subject: [PATCH 42/42] Update sycl/test-e2e/Matrix/element_wise_all_sizes.cpp

---
 sycl/test-e2e/Matrix/element_wise_all_sizes.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
index e3da4f7d130eb..22ec9b98a66c1 100644
--- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
+++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
@@ -11,7 +11,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// This is a version of the test with disabled device code
 // This is a version of the test with disabled device code
 // split to test against fixed bug in IGC
 // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out