From ad8a477e8928c568cd6e3ed21f1ecf465c9a294f Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 14 May 2024 14:13:38 -0700 Subject: [PATCH 01/42] SG32 #define SG_SZ --- sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp | 2 +- sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp | 2 +- sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp | 2 +- sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp | 2 +- .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp | 2 +- .../Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp | 2 +- .../Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp | 2 +- 
sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp index 182ec8e81233d..4833404610369 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp index 7b9655fe62416..3916aaff03867 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp index e88f0a0a135f5..ddfa39c541c0a 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp index 8a91d404f6948..ad644c8734475 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp index 06c1f5d3f5c96..06d459a2a3ce5 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp index 4824ff2568d30..4624110577ea2 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp @@ -13,6 +13,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp index 3bdd2ed83b08d..9d38fb7afa30d 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp @@ -17,7 +17,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp index 79383fce4b7fc..13d8df56f40a1 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp @@ -17,7 +17,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp index 275e22d5b509f..77949b4eab6d9 100644 --- 
a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp @@ -19,7 +19,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp index b9660e73e3ab2..46de02fe8f525 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp @@ -17,7 +17,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 // Sub-matrix N dimension static constexpr size_t SN = 16; diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp index 0ce3d22bc873b..c38d8f133264d 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index 7040058dc8554..b93985f8e594e 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -19,7 +19,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 3a023df7b10f8..10391f2e7e319 100644 --- 
a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -16,7 +16,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 1b7a8ed351139..994a2217d681f 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -21,7 +21,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 8c52421657229..4f7e3638daaf3 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -18,7 +18,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp index fc7d0c9e4eba2..2ea58e9953917 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp @@ -20,7 +20,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::bfloat16; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bfloat16_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 06798015261e7..6532bcfe47bff 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -25,7 +25,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::bfloat16; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp index e2158368ff6f8..70e53441cb48f 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp index 52d8bc9c6f4a4..b474f846d11d5 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp @@ -13,6 +13,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 #include "../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp index cb7b15819f2bb..f4dd217655439 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp @@ -18,7 +18,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include 
"../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index d7289579098e9..c89c657c77fbc 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -21,7 +21,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp index 09c4d6059750c..c8ee58e126732 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp @@ -18,7 +18,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_int8_vnni_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp index ed7fb96ca104a..1848a480a0eb7 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; constexpr size_t MATRIX_K = 1024 + 24; diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp index 6b059ed357781..b193d422c2b8c 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include 
"../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp index 5a13d4c1f1807..cfd89fcb8a1bf 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp index 9a82aa8bb647a..18da250bc808d 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp index 504e7beac85e3..214dd10f5158f 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp index 3532e5cc4e3ba..f4b2426af93a8 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; static constexpr size_t MATRIX_K = 1024 + 14; diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp 
b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp index a4292269811f1..aec91f70bd1d7 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp index 842977311cafa..b2d6510622736 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_uu_int8_impl.hpp" From cd17776673411bbd8013bb41472e3640b4d97cb0 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 14 May 2024 14:19:53 -0700 Subject: [PATCH 02/42] XMX8 no SG_SZ --- sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp | 1 - sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp | 1 - sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp | 1 - sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp | 1 - .../test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp | 1 - .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp | 1 - 
.../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp | 1 - 29 files changed, 29 deletions(-) diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp index aa2d2e28ac468..d7df42000249a 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp index f360bdbba6ada..826b99dfcf306 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp index 6f3aedfe506d5..a39cb6664d100 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp +++ 
b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp index ca425f7ded5d1..9ff39c8d516d0 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp index b9d49bba70abb..5bae6a3184808 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp @@ -13,7 +13,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp index 2975ab9edf6c4..87adf891cd16b 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp @@ -16,7 +16,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp index 5aa1cd8a2a0d7..d86af51e3cd86 100644 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp @@ -18,7 +18,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../get_coord_float_matC_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp index ece88423d0f43..e815b46e1ed21 100644 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp @@ -18,7 +18,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp index a84580c3f846c..4c4d6c6eb5765 100644 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp index be1ac0f24e88c..32b8c3bc6e24f 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t SN = 8; #include "../joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp index f02028d31e7ed..614a67db9ff8a 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp index b52e8085be172..fbcd21be62f75 100644 --- 
a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp @@ -15,7 +15,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp index 2e05e656e5379..c5e399bc98f48 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp @@ -15,7 +15,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp index 18238e4896ccb..ba24ea0dfc4b8 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -17,7 +17,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 49b5e6eebb4ac..9d88c89c50f41 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -17,7 +17,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp index 008db77761e3d..173ac16a42afc 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp +++ 
b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp index b72e2ed83841c..5a41f19bc2ac1 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bfloat16_32x64_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp index e6371806f3592..09c1a4ae32a92 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 static constexpr int TN = 8; #include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp index 494a84c173edb..7d74bf8055d6b 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp index dbe060711b02a..419cc936f14e4 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 
constexpr size_t TN = 8; #include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp index 728a057aedaa7..3dadaeebee511 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_int8_vnni_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp index 532af4dc5d844..07a48bd44fccd 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp @@ -8,7 +8,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t SN = 8; #include "../joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp index 944cccd310d3e..0ba69032465b9 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; static constexpr size_t MATRIX_K = 1024 + 24; diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp index 4a3770be74f91..fbd97d215498d 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp index d5c7a74c20aff..2694d0135c6a1 100644 --- 
a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp index 672e8b87e22e6..a0a98e3f16d0c 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp @@ -13,7 +13,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp index aa8e00c08b658..f42f37378514d 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; constexpr size_t MATRIX_K = 1024 + 14; diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp index 56feaaec924ad..0c5f46f6fcec6 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp index a1643332e489f..bc08632463f22 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr 
size_t TN = 8; #include "../joint_matrix_uu_int8_impl.hpp" From ae00144c0d1de9f66de6e8132e7a37f10b6227a0 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 15 May 2024 07:39:06 -0700 Subject: [PATCH 03/42] WIP abc_impl: remove SG_SZ --- sycl/test-e2e/Matrix/element_wise_abc.cpp | 1 - sycl/test-e2e/Matrix/element_wise_abc_impl.hpp | 16 ++++++++++++---- .../Matrix/element_wise_all_ops_half.cpp | 1 - .../Matrix/element_wise_all_ops_int8.cpp | 1 - .../Matrix/element_wise_all_ops_int8_packed.cpp | 1 - .../Matrix/element_wise_all_ops_tf32.cpp | 1 - sycl/test-e2e/Matrix/element_wise_all_sizes.cpp | 3 --- .../Matrix/elemwise_irreg_size_ops_bf16.cpp | 2 -- sycl/test-e2e/Matrix/get_coord_float_matC.cpp | 1 - sycl/test-e2e/Matrix/get_coord_int8_matA.cpp | 1 - sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp | 1 - .../Matrix/joint_matrix_annotated_ptr.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp | 1 - .../Matrix/joint_matrix_bf16_fill_k_cache.cpp | 1 - .../joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 1 - ...oint_matrix_bf16_fill_k_cache_unroll_init.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp | 1 - .../Matrix/joint_matrix_bfloat16_array.cpp | 2 +- ...joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 1 - .../Matrix/joint_matrix_bfloat16_packedB.cpp | 2 -- .../Matrix/joint_matrix_colA_rowB_colC.cpp | 1 - .../Matrix/joint_matrix_down_convert.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_half.cpp | 1 - .../joint_matrix_int8_colmajorA_colmajorB.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp | 1 - .../Matrix/joint_matrix_opt_kernel_feature.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp | 1 - .../Matrix/joint_matrix_rowmajorA_rowmajorB.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp | 4 ---- 
sycl/test-e2e/Matrix/joint_matrix_tf32.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp | 1 - .../test-e2e/Matrix/joint_matrix_unaligned_k.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp | 4 ---- 38 files changed, 14 insertions(+), 65 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/element_wise_abc.cpp index c9954fee4f898..0a6a4e4abaa03 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/element_wise_abc.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index bf8b2ecb4df85..8c08bfad7a867 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -15,6 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define TM 8 #define TK 32 +class add; template @@ -27,14 +28,21 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + std::cout << "Artem: before get_sg_size()\n"; + size_t sg_size = get_sg_size(q); + std::cout << "Artem: after get_sg_size()\n"; q.submit([&](handler &cgh) { accessor accC{bufC, cgh}; accessor accA{bufA, cgh}; accessor accB{bufB, cgh}; cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -60,14 +68,14 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, 
joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; }); joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; }); }); // parallel for diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp index fae692ff39ed9..c07d19ed73f2e 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp index 93ddcefc19ac3..e1a2cf4eecfa1 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp index 2d79d945e8980..24f82f47e8fcd 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp index 28483b5c2092e..6e2f8dcff6384 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp @@ -13,7 
+13,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index 661027237f836..1c07e494fcc47 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -12,7 +12,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -#define SG_SZ 16 - #include "element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp index a2b8ef5aa8b57..7ad89965f5243 100644 --- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp +++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp @@ -21,8 +21,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::bfloat16; -#define SG_SZ 16 - // 10x12 is not multiply the sg size, slicing implementation will have to insert // padding #define TM 10 diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp index 78a6f815df19c..57c9a00d98fd4 100644 --- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp @@ -16,7 +16,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp index 6500a34f48119..67fa811f2d764 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp @@ -16,7 +16,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "get_coord_int8_matA_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index 798afde072dd3..fe87e9a911b7b 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp index 408a6087206ea..0eb13cf57347c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 // Sub-matrix N dimension static constexpr size_t SN = 16; diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp index 896cbef04cff0..69c9eb31a4b9c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp @@ -15,7 +15,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_annotated_ptr_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp index 82ad15285a4fa..d58677fa2c178 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp index 0c93876db2a15..abee7d7259f28 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp @@ -18,7 +18,6 @@ #include "common.hpp" 
#include -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp index 7206cb165349b..d839f3db8f481 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -14,7 +14,7 @@ #include "common.hpp" #include -#define SG_SZ 16 + constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 5518d9cb08fbc..1800901e24111 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -20,7 +20,6 @@ #include "common.hpp" #include -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index a393f3a2ad729..701c17741f576 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -17,7 +17,6 @@ #include "common.hpp" #include -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp index d1410ac68276e..2222cbb605a15 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bfloat16_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp index 80e1f310ce440..98ed155b297ad 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp @@ -11,7 +11,7 @@ // RUN: %{run} %t.out #include "common.hpp" -#define SG_SZ 16 + static constexpr int TN = 16; #include "joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 9cd31a8c5178e..19d12915b4a95 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -20,7 +20,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp index 3e80168752545..0d592e04b606c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp @@ -12,6 +12,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -#define SG_SZ 16 #include "joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp index 7d114175dff13..354a71006e129 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp index caea640677aa7..dee504c22e7f6 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp @@ -11,7 +11,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -constexpr size_t SG_SZ = 16; - #include "joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp index ac09361a0799c..9281e47f572d2 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp index 33c00022a5a76..fb29cc2baaf74 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -20,7 +20,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp index 02813c6720deb..8dcddb841721d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_int8_vnni_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp index 6195ee2935892..031c7753de425 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp @@ -16,7 +16,6 @@ #include "common.hpp" -#define SG_SZ 16 static 
constexpr size_t SN = 16; #include "joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp index 854d3ccc85dce..d11a6498ca7dd 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; constexpr size_t MATRIX_K = 1024 + 24; diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp index 30d9278e07157..7abea83c6d287 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp @@ -13,6 +13,5 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp index 958bd94fe0cd3..77df6085bc09a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp @@ -16,8 +16,4 @@ // transform. 
This is currently only available on AMX and XMX of PVC #include "common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - #include "joint_matrix_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp index e487b8cdcb41d..2089e0185b0e0 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp index 72910c4ed5446..7a02d03b9d642 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp index 6f34a4acbea61..922b79f356e78 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp @@ -13,10 +13,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp index f98c8bd3c7b48..bd04b157cf667 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "joint_matrix_transposeC_impl.hpp" diff 
--git a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp index 212ac34a3a640..e1cf6cb6cf8bb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; static constexpr size_t MATRIX_K = 1024 + 14; diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp index 409b589904847..f4237b995aad8 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp index 59a47484a335c..a75d18b9e6967 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_uu_int8_impl.hpp" From d26787405f0a3ee8334a1b0a8474c16897e86340 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 15 May 2024 15:21:26 -0700 Subject: [PATCH 04/42] Made tests independent of SG_SZ --- .../XMX8/joint_matrix_opt_kernel_feature.cpp | 3 - .../test-e2e/Matrix/element_wise_abc_impl.hpp | 8 +- .../Matrix/element_wise_all_sizes_impl.hpp | 54 ++++++----- .../Matrix/get_coord_float_matC_impl.hpp | 15 ++-- .../Matrix/get_coord_int8_matA_impl.hpp | 73 ++++++++------- .../Matrix/joint_matrix_all_sizes_impl.hpp | 57 ++++++------ .../Matrix/joint_matrix_apply_bf16_impl.hpp | 57 ++++++------ .../joint_matrix_bf16_fill_k_cache_impl.hpp | 23 +++--
.../joint_matrix_bfloat16_array_impl.hpp | 15 ++-- .../Matrix/joint_matrix_bfloat16_impl.hpp | 15 ++-- .../joint_matrix_bfloat16_packedB_impl.hpp | 15 ++-- .../Matrix/joint_matrix_down_convert_impl.hpp | 15 ++-- .../Matrix/joint_matrix_half_impl.hpp | 90 ++++++++++--------- .../Matrix/joint_matrix_int8_vnni_impl.hpp | 14 +-- .../joint_matrix_opt_kernel_feature.cpp | 3 - .../joint_matrix_opt_kernel_feature_impl.hpp | 18 ++-- .../Matrix/joint_matrix_ss_int8_impl.hpp | 14 +-- .../Matrix/joint_matrix_su_int8_impl.hpp | 16 ++-- .../Matrix/joint_matrix_tf32_impl.hpp | 15 ++-- .../Matrix/joint_matrix_transposeC_impl.hpp | 89 +++++++++--------- .../Matrix/joint_matrix_us_int8_impl.hpp | 14 +-- .../Matrix/joint_matrix_uu_int8_impl.hpp | 16 ++-- 22 files changed, 357 insertions(+), 282 deletions(-) diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp index 07a48bd44fccd..30b3522ad2442 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp @@ -7,7 +7,4 @@ // incompatible on the current device #include "../common.hpp" - -constexpr size_t SN = 8; - #include "../joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index 8c08bfad7a867..dea0cf882eaaf 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -15,7 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define TM 8 #define TK 32 -class add; +// class add; template @@ -28,15 +28,13 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; - std::cout << "Artem: before get_sg_size()\n"; - size_t sg_size = get_sg_size(q); - std::cout << "Artem: after get_sg_size()\n"; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { 
accessor accC{bufC, cgh}; accessor accA{bufA, cgh}; accessor accB{bufB, cgh}; - cgh.parallel_for( + cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp index 4020e8b84bbd2..5800ab9c62745 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp @@ -23,7 +23,7 @@ void assert_ops_ref(host_accessor C, } } -template +template void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { static constexpr size_t M = TM * M_MULTIPLIER; static constexpr size_t K = 128; @@ -32,7 +32,8 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { size_t NDRangeM = M / TM; size_t NDRangeK = K / TK; queue q; - nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ}); + size_t sg_size = get_sg_size(q); + nd_range<2> r({NDRangeM, NDRangeK * sg_size}, {1, 1 * sg_size}); big_matrix A((T *)&MatA); buffer bufA(A.get_data(), range<2>(M, K)); @@ -40,8 +41,12 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { q.submit([&](handler &cgh) { sycl::accessor accA{bufA, cgh, sycl::read_write}; - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); @@ -57,41 +62,42 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { ext::intel::experimental::matrix::joint_matrix_store( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TM) * K + sg_starty / SG_SZ * TK, + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); }); // parallel for }).wait(); 
assert_ops_ref(bufA.get_host_access(), result); } -template void add_ref() { +template +void add_ref() { if constexpr (std::is_same_v) { // Tests whether 5 + 2 = 7 operation is successful. - matrix_verify_add(bfloat16(5.0), bfloat16(2.0), - bfloat16(7.0)); + matrix_verify_add( + bfloat16(5.0), bfloat16(2.0), bfloat16(7.0)); } if constexpr (std::is_same_v) { - matrix_verify_add(5 /*val1*/, 2 /*val2*/, - 7 /*result*/); + matrix_verify_add(5 /*val1*/, 2 /*val2*/, + 7 /*result*/); } } int main() { - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); std::cout << "Passed\n"; } diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp index bedc91bdc39d4..b424a01a7c6a6 100644 --- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp @@ -28,13 +28,18 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { buffer sum_rows_v(sum_rows, M); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto v = sum_rows_v.get_access(cgh); - cgh.parallel_for( - nd_range<2>({M / TM, N / TN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({M / TM, N / TN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -49,7 +54,7 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { joint_matrix_load( 
sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); float sum_local_rows[M] = {0}; @@ -62,7 +67,7 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { sum_local_rows[i] = reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); // only Groups leader perform the global reduction - if (global_idy % SG_SZ == 0) { + if (global_idy % sg_size == 0) { sycl::atomic_ref aref(v[i]); diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp index afda0f90a6e37..6f57ab5b4e63c 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp @@ -72,45 +72,54 @@ W0 --> 0 0 1 1 2 2 3 3 .... 7 7 // clang-format on template -void matrix_sum_rows(queue q, big_matrix &A, nd_range<2> &r) { +void matrix_sum_rows(big_matrix &A) { buffer bufA(A.get_data(), range<2>(M, K)); // size of vector is equal to number of rows in big matrix int sum_rows[M] = {0}; buffer sum_rows_v(sum_rows, M); + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({M / TM, K / TK * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { auto accA = bufA.get_access(cgh); auto v = sum_rows_v.get_access(cgh); - cgh.parallel_for(r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size( - SG_SZ)]] { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix_load(sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM * K) + sg_starty / SG_SZ * TK, - K); - - int32_t sum_local_rows[M] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_a, [&](int8_t &x, size_t row, size_t col) { - 
sum_local_rows[row + global_idx * TM] += x; - }); - for (int i = 0; i < M; ++i) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - - // only Groups leader performs the global reduction - if (global_idy % SG_SZ == 0) - atomic_fetch_add(v[i], sum_local_rows[i]); - } - }); // parallel for + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM * K) + sg_starty / sg_size * TK, + K); + + int32_t sum_local_rows[M] = {0}; + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub_a, [&](int8_t &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * TM] += x; + }); + for (int i = 0; i < M; ++i) { + sum_local_rows[i] = + reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); + + // only Groups leader performs the global reduction + if (global_idy % sg_size == 0) + atomic_fetch_add(v[i], sum_local_rows[i]); + } + }); // parallel for }).wait(); sum_rows_ref(bufA.get_host_access(), sum_rows_v.get_host_access()); } @@ -124,8 +133,6 @@ int main() { size_t NDRangeM = MATRIX_M / TM; size_t NDRangeK = MATRIX_K / TK; - queue q; - nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ}); for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { @@ -133,7 +140,7 @@ int main() { } } - matrix_sum_rows(q, MA, r); + matrix_sum_rows(MA); std::cout << "Passed\n"; return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp index edfcfe1d2e979..8e9880235c2b2 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp @@ -9,7 +9,7 @@ static constexpr size_t M_MULTIPLIER = 16; template + int vnniFactor, size_t TM, size_t TN, size_t TK, typename kernel_name> void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { size_t NDRangeM = M / TM; @@ -19,15 +19,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { sycl::accessor accC{bufC, cgh, sycl::read_write}; sycl::accessor accA{bufA, cgh, sycl::read_only}; sycl::accessor accB{bufB, cgh, sycl::read_only}; - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -47,7 +50,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -59,21 +62,21 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sg, sub_b, accB.template get_multi_ptr() + (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); } template + size_t 
tK, typename kernel_name> int init_and_multiply() { static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; static constexpr size_t MATRIX_N = 128; @@ -100,7 +103,7 @@ int init_and_multiply() { (Ta *)&Bvnni); matrix_multiply(MC, MA, MBvnni); + tK, kernel_name>(MC, MA, MBvnni); matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K); bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D); @@ -110,23 +113,23 @@ int init_and_multiply() { int main() { int errors = 0; - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); return errors; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp index 1ec089d0f53f2..796bdce8d0752 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp @@ -13,35 +13,41 @@ template struct apply_add { void operator()(T &x) const { x = x + bfloat16(2); } }; -template -void 
matrix_verify_add(queue q, big_matrix &A, nd_range<2> &r, - const float ref, F &&lambda) { +template +void matrix_verify_add(big_matrix &A, const float ref, F &&lambda) { buffer bufA(A.get_data(), range<2>(M, N)); + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({M / TM, N / TN * sg_size}, {1, 1 * sg_size}); + q.submit([&](handler &cgh) { accessor accA{bufA, cgh}; - cgh.parallel_for(r, [accA, lambda]( - nd_item<2> spmd_item) [[intel::reqd_sub_group_size( - SG_SZ)]] { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; - joint_matrix_fill(sg, sub_a, bfloat16(5.0)); + joint_matrix_fill(sg, sub_a, bfloat16(5.0)); - joint_matrix_apply(sg, sub_a, lambda); + joint_matrix_apply(sg, sub_a, lambda); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N); - }); // parallel for + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N); + }); // parallel for }).wait(); // Check if the results are correct { @@ -61,14 +67,9 @@ int main() { big_matrix MA((bfloat16 *)&A); - size_t NDRangeM = MATRIX_M / TM; - size_t NDRangeN = MATRIX_N / TN; - queue q; - nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * 
SG_SZ}); - - matrix_verify_add( - q, MA, r, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); }); - matrix_verify_add(q, MA, r, 7.0, + matrix_verify_add( + MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); }); + matrix_verify_add(MA, 7.0, apply_add()); std::cout << "Passed\n"; return 0; diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 1c29f866f134c..36cfb5ea1f069 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -68,8 +68,9 @@ static constexpr void manually_unroll_loop(F &&f) { template + typename TResult> double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { + size_t sgSize = get_sg_size(q); range<2> global{rowsA / MCACHE1, (colsB / NCACHE1) * sgSize}; range<2> cachelocal{MCACHE2 / MCACHE1, NCACHE2 / NCACHE1 * sgSize}; @@ -82,12 +83,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); - auto mk = q.submit([&](handler &h) { - h.parallel_for( // cache layer#1 + static auto work = [&](handler &h) { + h.parallel_for( // cache layer#1 nd_range<2>{global, cachelocal}, // loop global // loop localrange - [=](nd_item<2> it) [[intel::reqd_sub_group_size(sgSize)]] { + [=](nd_item<2> it) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { auto pA = address_space_cast(A); @@ -243,8 +248,8 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { }); // for k1 #else } // n - } // m - } // k1 + } // m + } // k1 #endif } // for k2 #ifdef MANUAL_UNROLL @@ -267,10 +272,12 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { }); // m #else } // n - } // m + } // m #endif }); // parallel_for - }); // queue.submit + }; // queue.submit + q.submit(work); + if (i == testIterations - 1) q.wait(); 
std::chrono::duration duration = diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp index 5be3c485312c2..bc317ffc27d31 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp @@ -23,14 +23,19 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // Matrix API has to be accessed by all the workitems in a // subgroup. These functions will be called once by the subgroup. // No code divergence between the workitems. 
@@ -57,7 +62,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, N * 2); for (int i = 0; i < JM_ARRAY_SZ; ++i) { @@ -75,7 +80,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sg, sub_c[i], accC.template get_multi_ptr() + (sg_startx * TM * JM_ARRAY_SZ + TM * i) * N + - sg_starty / SG_SZ * TN, + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp index 8cb6c120d8a34..068506cc63724 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp @@ -19,15 +19,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -49,7 +52,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { // joint_matrix_load( @@ -60,14 +63,14 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, 
accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, N * 2); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp index 91156c3fcc128..36ce0f81f0c63 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp @@ -17,15 +17,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -47,7 +50,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { // joint_matrix_load( @@ -59,14 +62,14 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, + (k * TK / 2) * (N * 2) + 
sg_starty / sg_size * TN * 2, N * 2); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp index 3f02be1358844..54861eb3b1d3b 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp @@ -23,13 +23,18 @@ void matrix_copy(big_matrix &C, big_matrix &A) { buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -46,13 +51,13 @@ void matrix_copy(big_matrix &C, big_matrix &A) { joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); joint_matrix_copy(sg, sub_c, sub_a); ext::intel::experimental::matrix::joint_matrix_store( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp 
index aad8aeaa5c602..53b4ca7b97412 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp @@ -27,56 +27,60 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - [[intel::reqd_sub_group_size(SG_SZ)]] { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup - // no code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup + // no code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - // For B, we assume B has been already VNNIed. - joint_matrix - sub_b; - joint_matrix sub_c; + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. 
+ joint_matrix + sub_b; + joint_matrix sub_c; - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, - N * 2); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - }); // parallel for + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, + N * 2); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for }).wait(); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp index 96993082d8cb5..625b41f3037b8 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp @@ -26,15 +26,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, 
accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -61,14 +65,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK) * N + sg_starty / SG_SZ * TN, + (k * TK) * N + sg_starty / sg_size * TN, N); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp index 031c7753de425..5acc54a412096 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp @@ -15,7 +15,4 @@ // incompatible on the current device #include "common.hpp" - -static constexpr size_t SN = 16; - #include "joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp index a0b468120ebd3..7aba5911c8386 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp @@ -22,14 +22,19 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { sycl::accessor accC{bufC, cgh, sycl::read_write}; sycl::accessor accA{bufA, cgh, sycl::read_only}; sycl::accessor accB{bufB, cgh, 
sycl::read_only}; - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); @@ -44,7 +49,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -56,7 +61,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sg, sub_b, accB.template get_multi_ptr() + (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } @@ -95,8 +100,9 @@ int main() { init_and_multiply(); // 500 is not correct size } catch (const sycl::exception &e) { - if (e.code() == errc::kernel_not_supported) + if (e.code() == errc::invalid) return 0; + throw; } return 1; diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp index ef67ebbd951f3..3e00c667c2505 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp @@ -28,15 +28,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, 
accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -64,14 +68,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp index 3973a7b516bc8..f8feb25d99229 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp @@ -28,15 +28,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -57,7 +61,7 @@ void 
matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -68,14 +72,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp index 2b2fae59cd94d..536fa84581f27 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp @@ -27,15 +27,18 @@ void matrix_multiply(big_matrix &C, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The matrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -56,7 +59,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); 
for (int k = 0; k < K; k += TK) { joint_matrix_load( @@ -67,7 +70,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k) * (N) + sg_starty / SG_SZ * TN, + (k) * (N) + sg_starty / sg_size * TN, N); // If no rounding to tf32 function is called, joint_matrix_mad // function will work on truncated floats. @@ -81,7 +84,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 624cfdb256e7d..faea43b062477 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -11,6 +11,9 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; +template +class LS; + template void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, queue q) { @@ -22,47 +25,51 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { - auto p_input = - address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_matrix; - - auto row_major_offset = - (sg_startx * TM) * N + 
(sg_starty / SG_SZ * TN); - auto col_major_offset = - (sg_startx * TM) + (sg_starty / SG_SZ * TN) * M; - - joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, - layout::col_major); - - joint_matrix_store(sg, sub_matrix, - p_out_col_major + row_major_offset, N, - layout::row_major); - - joint_matrix_store(sg, sub_matrix, - p_out_row_major + col_major_offset, M, - layout::col_major); - }); // parallel for - }).wait(); + size_t sg_size = get_sg_size>(q); + + static auto work = [&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_matrix; + + auto row_major_offset = + (sg_startx * TM) * N + (sg_starty / sg_size * TN); + auto col_major_offset = + (sg_startx * TM) + (sg_starty / sg_size * TN) * M; + + joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, + layout::col_major); + + joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset, + N, layout::row_major); + + joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset, + M, layout::col_major); + }); // parallel for + }; + q.submit(work).wait(); } template void run_matrix_test() { diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp index 5441df5fe2542..db8eda82ef239 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp @@ -28,16 
+28,18 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] - +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -59,7 +61,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -71,14 +73,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp index 4dcb60f4330fc..7e7edb700debb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp @@ -28,15 +28,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = 
bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -57,7 +61,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -69,14 +73,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); From 0a71b8844cefb148be74961576b86d57ff6f062e Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 15 May 2024 15:37:33 -0700 Subject: [PATCH 05/42] clang-format --- sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp | 4 ++-- 
sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp | 3 +-- 9 files changed, 13 insertions(+), 14 deletions(-) diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp index b424a01a7c6a6..32ceaf8c730a0 100644 --- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp @@ -39,7 +39,7 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp index 8e9880235c2b2..b48e46e18de3d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp @@ -102,8 +102,8 @@ int init_and_multiply() { big_matrix MBvnni( (Ta *)&Bvnni); - matrix_multiply(MC, MA, MBvnni); + matrix_multiply(MC, MA, MBvnni); matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K); bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D); diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp index 796bdce8d0752..3d3c6304952e5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp @@ -69,8 +69,8 @@ int main() { matrix_verify_add( MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); }); - matrix_verify_add(MA, 7.0, - apply_add()); + matrix_verify_add( + MA, 7.0, apply_add()); std::cout << "Passed\n"; return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp index 
bc317ffc27d31..9aefc370bd0c6 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp @@ -31,11 +31,11 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { + { // Matrix API has to be accessed by all the workitems in a // subgroup. These functions will be called once by the subgroup. // No code divergence between the workitems. diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp index 068506cc63724..aef22d35f7d17 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp @@ -27,7 +27,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp index 36ce0f81f0c63..6a7182c41985d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp @@ -25,7 +25,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp index 54861eb3b1d3b..8ac48511c7e10 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp +++ 
b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp @@ -30,11 +30,11 @@ void matrix_copy(big_matrix &C, big_matrix &A) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp index 536fa84581f27..69991884c0710 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp @@ -35,7 +35,7 @@ void matrix_multiply(big_matrix &C, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index faea43b062477..5de94de6a18ba 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -11,8 +11,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -template -class LS; +template class LS; template void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, From 5621804564637714f3e28356184a122279ec582c Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Thu, 16 May 2024 08:15:33 -0700 Subject: [PATCH 06/42] Clean up nits --- sycl/test-e2e/Matrix/element_wise_abc_impl.hpp | 1 - sycl/test-e2e/Matrix/get_coord_float_matC.cpp | 4 ---- sycl/test-e2e/Matrix/get_coord_int8_matA.cpp | 4 ---- sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp | 3 --- sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 4 ---- 
sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp | 3 --- .../Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_half.cpp | 3 --- .../test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp | 3 --- 11 files changed, 34 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index dea0cf882eaaf..655fa90275f40 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -15,7 +15,6 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define TM 8 #define TK 32 -// class add; template diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp index 57c9a00d98fd4..af7e8e1745781 100644 --- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp @@ -11,10 +11,6 @@ // RUN: %{run} %t.out #include "common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp index 67fa811f2d764..d29217577443e 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp @@ -11,10 +11,6 @@ // RUN: %{run} %t.out #include "common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp index 6f57ab5b4e63c..3f39ebf731801 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp @@ -131,9 +131,6 @@ int main() { big_matrix MA((int8_t *)&A); - size_t NDRangeM = MATRIX_M / 
TM; - size_t NDRangeK = MATRIX_K / TK; - for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { A[i][j] = i + j; diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index fe87e9a911b7b..be35a4e672c30 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -12,10 +12,6 @@ // XFAIL: * #include "common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp index 0eb13cf57347c..1478914d1e44f 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - // Sub-matrix N dimension static constexpr size_t SN = 16; diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp index 2222cbb605a15..1985bcb6a4fb9 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 19d12915b4a95..21d5f1239cd8d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -17,9 +17,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff 
--git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp index 9281e47f572d2..0bacfa93792d6 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp @@ -14,9 +14,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp index fb29cc2baaf74..37769a41f7003 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -17,9 +17,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp index 8dcddb841721d..f592057ce94d5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_int8_vnni_impl.hpp" From 23a7afcd215b08be355c220a11d7ee8c4b93135b Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Thu, 16 May 2024 12:22:22 -0700 Subject: [PATCH 07/42] Managed to remove the static code --- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 5 +- .../Matrix/joint_matrix_transposeC_impl.hpp | 83 ++++++++++--------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 36cfb5ea1f069..51382467c0459 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -83,7 +83,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); - static auto work = [&](handler &h) { + q.submit([&](handler &h) { h.parallel_for( // cache layer#1 nd_range<2>{global, cachelocal}, // loop global @@ -275,8 +275,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } // m #endif }); // parallel_for - }; // queue.submit - q.submit(work); + }); // queue.submit if (i == testIterations - 1) q.wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 5de94de6a18ba..24ba24a264f0d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -26,49 +26,50 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeN = N / TN; size_t sg_size = get_sg_size>(q); - static auto work = [&](handler &cgh) { - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) #ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] + [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { - auto p_input = - address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = 
spmd_item.get_sub_group(); - joint_matrix sub_matrix; - - auto row_major_offset = - (sg_startx * TM) * N + (sg_starty / sg_size * TN); - auto col_major_offset = - (sg_startx * TM) + (sg_starty / sg_size * TN) * M; - - joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, - layout::col_major); - - joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset, - N, layout::row_major); - - joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset, - M, layout::col_major); - }); // parallel for - }; - q.submit(work).wait(); + { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_matrix; + + auto row_major_offset = + (sg_startx * TM) * N + (sg_starty / sg_size * TN); + auto col_major_offset = + (sg_startx * TM) + (sg_starty / sg_size * TN) * M; + + joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, + layout::col_major); + + joint_matrix_store(sg, sub_matrix, + p_out_col_major + row_major_offset, N, + layout::row_major); + + joint_matrix_store(sg, sub_matrix, + p_out_row_major + col_major_offset, M, + layout::col_major); + }); // parallel for + }).wait(); } template void run_matrix_test() { From a3c310b7994b4a507a218aee0eb21404d066fc80 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 08:03:57 -0700 Subject: [PATCH 08/42] Pass: elemwise_irreg_size_ops_bf16.cpp --- .../Matrix/elemwise_irreg_size_ops_bf16.cpp | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp 
b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp index 7ad89965f5243..4d453fe35da5c 100644 --- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp +++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp @@ -16,10 +16,7 @@ #include #include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::bfloat16; +#include "common.hpp" // 10x12 is not multiply the sg size, slicing implementation will have to insert // padding @@ -27,16 +24,6 @@ using bfloat16 = sycl::ext::oneapi::bfloat16; #define TN 12 #define TK 16 -template struct big_matrix { -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) {} -}; - template @@ -55,16 +42,18 @@ void matrix_multiply(big_matrix &C, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] - +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -85,7 +74,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K; k += TK) { joint_matrix_load( @@ -97,7 +86,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k) * (N) + sg_starty / SG_SZ * TN * 2, + (k) * (N) + sg_starty / sg_size * TN * 2, N * 2); 
joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } @@ -105,7 +94,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); @@ -119,13 +108,6 @@ bfloat16 B[MATRIX_K / 2][MATRIX_N * 2]; float C[MATRIX_M][MATRIX_N]; float D[MATRIX_M][MATRIX_N]; -float make_fp32(bfloat16 x) { - unsigned int y = *((int *)&x); - y = y << 16; - float *res = reinterpret_cast(&y); - return *res; -} - void matrix_multiply_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) { for (int m = 0; m < M; m++) From 22da1c2a6d2c577d7aa94e8e41a037c6cbf6fcef Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 08:15:04 -0700 Subject: [PATCH 09/42] Pass: joint_matrix_annotated_ptr --- .../joint_matrix_annotated_ptr_impl.hpp | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp index 38e331bb04181..2eeba80572608 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp @@ -11,16 +11,22 @@ #define TM 8 #define TK 16 +template +class mult; + template void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); @@ -53,20 +59,20 @@ 
void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) { syclintelex::cache_control>}}; joint_matrix_load( - sg, sub_c, C_ptr + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + sg, sub_c, C_ptr + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load(sg, sub_a, A_ptr + (sg_startx * TM) * K + k * TK, K); if constexpr (vnniFactor == 0) { joint_matrix_load( - sg, sub_b, B_ptr + (k * TK) * N + sg_starty / SG_SZ * TN, N); + sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, N); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } else { joint_matrix_load(sg, sub_bp, B_ptr + (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_bp, sub_c); @@ -79,7 +85,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) { syclex::cache_level::L2>>}}; joint_matrix_store(sg, sub_c, C_w_ptr + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); From 850f30b7177f46fb7f2ad9de3915616fd67c5c35 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 08:17:56 -0700 Subject: [PATCH 10/42] Pass: joint_matrix_bfloat16_colmajorA_colmajorB --- ...t_matrix_bfloat16_colmajorA_colmajorB_impl.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp index f7a78a1c8da87..6771795c70a0b 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp @@ -19,15 +19,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { 
auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -48,7 +51,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -59,14 +62,14 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (sg_starty / SG_SZ * TN) * K + k * TK, + (sg_starty / sg_size * TN) * K + k * TK, K); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); From e7fcb5c6cf36e59447343b7ac833adffac62c45a Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 08:21:23 -0700 Subject: [PATCH 11/42] Pass: joint_matrix_int8_colmajorA_colmajorB --- .../joint_matrix_int8_colmajorA_colmajorB_impl.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp index 5123003769465..1390f8225406c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp +++ 
b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB_impl.hpp @@ -25,15 +25,18 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -60,14 +63,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (sg_starty / SG_SZ * TN) * K + k * TK, + (sg_starty / sg_size * TN) * K + k * TK, K); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); From 596aaffaf0e5dec1c79e72bf3da4099c1a6238ee Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 08:27:49 -0700 Subject: [PATCH 12/42] Pass: joint_matrix_prefetch --- .../Matrix/joint_matrix_prefetch_impl.hpp | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp index e4111526b4696..a2c5864fc0f14 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp @@ -11,10 +11,13 @@ #define TM 8 #define TK 16 +template +class mult; + template void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty, - T1 *A, T2 
*B, T *C) { + size_t sg_size, T1 *A, T2 *B, T *C) { auto pA = address_space_cast(A); auto pB = address_space_cast( - sg, B + sg_starty / SG_SZ * TN * vnniFactor, N * vnniFactor, B_layout, + sg, B + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor, B_layout, syclex::properties{syclex::prefetch_hint_L1}); joint_matrix_prefetch( - sg, C + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, + sg, C + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major, syclex::properties{syclex::prefetch_hint_L1}); joint_matrix_fill(sg, sub_c, 1); for (int k = 0; k < K; k += TK) { joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); joint_matrix_load(sg, sub_b, - pB + k * N + sg_starty / SG_SZ * TN * vnniFactor, + pB + k * N + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store(sg, sub_c, - pC + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, + pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); } @@ -54,11 +57,14 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); @@ -67,7 +73,7 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) { sub_group sg = spmd_item.get_sub_group(); joint_matrix_gemm_vnni( - sg, sg_startx, sg_starty, A, B, C); + sg, sg_startx, sg_starty, sg_size, A, B, C); }); // parallel for }).wait(); } From 3563d69469e20fc9c2e6a730bbf70f2d28ed534b Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy 
Date: Tue, 21 May 2024 08:44:00 -0700 Subject: [PATCH 13/42] Fixed sycl merge and joint_matrix_prefetch --- sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp index 7b6e5b9a861ca..56c8af2325ac1 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp @@ -11,7 +11,7 @@ #define TM 8 #define TK 16 -template +template class mult; template >(q); + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { - cgh.parallel_for>( + cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ From c1bca5e1798362404bd55d36f7c6f5b1bcfd52bb Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 09:08:21 -0700 Subject: [PATCH 14/42] Fixed CPU tests --- sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 1 - .../Matrix/get_coord_int8_matB_impl.hpp | 95 ++++++++++--------- .../joint_matrix_colA_rowB_colC_impl.hpp | 17 ++-- .../Matrix/joint_matrix_out_bounds_impl.hpp | 24 +++-- 4 files changed, 73 insertions(+), 64 deletions(-) diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index ad064fd82fc0a..feac65bf0e4bf 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -9,7 +9,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu #include "common.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp index d0f77a288d938..480f01ca77ceb 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp @@ -94,55 +94,65 @@ wi [1,0] --> i=0, [8, 0] // clang-format on template -void matrix_sum_cols(queue q, big_matrix &B, - big_matrix 
&Bvnni, nd_range<2> &r) { +void matrix_sum_cols(big_matrix &B, + big_matrix &Bvnni) { buffer bufB(B.get_data(), range<2>(K, N)); buffer bufBvnni(Bvnni.get_data(), range<2>(K / VF, N * VF)); int sum_cols[N] = {0}; buffer sum_cols_v(sum_cols, N); + size_t NDRangeK = K / TK; + size_t NDRangeN = N / TN; + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size}); + q.submit([&](handler &cgh) { auto accB = bufBvnni.get_access(cgh); auto v = sum_cols_v.get_access(cgh); - cgh.parallel_for(r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size( - SG_SZ)]] { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - - joint_matrix - sub_b; - - joint_matrix_load(sg, sub_b, - accB.template get_multi_ptr() + - (sg_startx * (TK / VF) * N * VF) + - sg_starty / SG_SZ * TN * VF, - N * VF); - - int32_t sum_local_cols[N] = {0}; - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_b, [&](int8_t &x, size_t row, size_t col) { - // the coordinates returned are in the logical range [K,N] - // If users want to retrieve the VNNIed coordinates, they can be - // obtained using - // colVNNI = col/VF - // rowVNNI = row*VF - size_t global_index = col + global_idy / SG_SZ * TN; - sum_local_cols[global_index] += x; - }); - - for (int i = 0; i < N; i++) { - sum_local_cols[i] = - reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); - if (global_idy % SG_SZ == 0) - atomic_fetch_add(v[i], sum_local_cols[i]); - } - }); // parallel for + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - 
spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + + joint_matrix + sub_b; + + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (sg_startx * (TK / VF) * N * VF) + + sg_starty / sg_size * TN * VF, + N * VF); + + int32_t sum_local_cols[N] = {0}; + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub_b, [&](int8_t &x, size_t row, size_t col) { + // the coordinates returned are in the logical range [K,N] + // If users want to retrieve the VNNIed coordinates, they can + // be obtained using colVNNI = col/VF rowVNNI = row*VF + size_t global_index = col + global_idy / sg_size * TN; + sum_local_cols[global_index] += x; + }); + + for (int i = 0; i < N; i++) { + sum_local_cols[i] = + reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); + if (global_idy % sg_size == 0) + atomic_fetch_add(v[i], sum_local_cols[i]); + } + }); // parallel for }).wait(); sum_cols_ref(bufB.get_host_access(), sum_cols_v.get_host_access()); } @@ -158,11 +168,6 @@ int main() { int8_t Bvnni[MATRIX_K / VF][MATRIX_N * VF]; big_matrix MBvnni((int8_t *)&Bvnni); - size_t NDRangeK = MATRIX_K / TK; - size_t NDRangeN = MATRIX_N / TN; - queue q; - nd_range<2> r({NDRangeK, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}); - for (int i = 0; i < MATRIX_K; i++) { for (int j = 0; j < MATRIX_N; j++) { B[i][j] = i + j; @@ -170,7 +175,7 @@ int main() { } matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VF); // This test calculates sum of columns in the non VNNI B matrix - matrix_sum_cols(q, MB, MBvnni, r); + matrix_sum_cols(MB, MBvnni); std::cout << "Passed\n"; return 0; } \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp index 3347f9e7cc39e..ae0620f3c5459 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp @@ -10,8 +10,6 
@@ #include #include -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TM = 8; constexpr size_t TK = 16; @@ -27,12 +25,15 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B); size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { auto pA = address_space_cast #include -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TM = 8; constexpr size_t TK = 16; +template +class mult; + template @@ -27,11 +27,15 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { // Add one iteration for the out of bounds dpas instruction size_t NDRangeM = M / TM + (((M % TM) != 0) ? 
1 : 0); size_t NDRangeN = N / TN; - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + size_t sg_size = get_sg_size>(q); + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { auto pA = address_space_cast sub_c; // bounds-checked load where width and height are added ext::intel::experimental::matrix::joint_matrix_fill_checked( - sg, sub_c, 1, N, M, N, sg_startx * TM, sg_starty / SG_SZ * TN); + sg, sub_c, 1, N, M, N, sg_startx * TM, sg_starty / sg_size * TN); for (int k = 0; k < K; k += TK) { // bounds-checked load where width and height are added ext::intel::experimental::matrix::joint_matrix_load_checked( @@ -69,13 +73,13 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { // bounds-checked load where width and height are added ext::intel::experimental::matrix::joint_matrix_load_checked( sg, sub_b, pB, N * vnniFactor, K / vnniFactor, N * vnniFactor, - k, sg_starty / SG_SZ * TN * vnniFactor); + k, sg_starty / sg_size * TN * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } // bounds-checked store where width and height are added ext::intel::experimental::matrix::joint_matrix_store_checked( sg, sub_c, pC, N, layout::row_major, M, N, sg_startx * TM, - sg_starty / SG_SZ * TN); + sg_starty / sg_size * TN); }); // parallel for }).wait(); } From 3fe79dafd3856a3043ca347f8324817025935e32 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 09:20:04 -0700 Subject: [PATCH 15/42] clang-format --- .../Matrix/elemwise_irreg_size_ops_bf16.cpp | 2 +- .../Matrix/joint_matrix_annotated_ptr_impl.hpp | 13 +++++++------ ...t_matrix_bfloat16_colmajorA_colmajorB_impl.hpp | 2 +- .../Matrix/joint_matrix_out_bounds_impl.hpp | 5 ++--- .../Matrix/joint_matrix_prefetch_impl.hpp 
| 15 +++++++-------- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp index 4d453fe35da5c..10c5e195a5344 100644 --- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp +++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp @@ -14,9 +14,9 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +#include "common.hpp" #include #include -#include "common.hpp" // 10x12 is not multiply the sg size, slicing implementation will have to insert // padding diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp index 2eeba80572608..5463ea040d1eb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr_impl.hpp @@ -11,8 +11,7 @@ #define TM 8 #define TK 16 -template -class mult; +template class mult; template @@ -58,15 +57,17 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue &q) { syclex::properties{syclintelex::read_hint< syclintelex::cache_control>}}; - joint_matrix_load( - sg, sub_c, C_ptr + (sg_startx * TM) * N + sg_starty / sg_size * TN, - N, layout::row_major); + joint_matrix_load(sg, sub_c, + C_ptr + (sg_startx * TM) * N + + sg_starty / sg_size * TN, + N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load(sg, sub_a, A_ptr + (sg_startx * TM) * K + k * TK, K); if constexpr (vnniFactor == 0) { joint_matrix_load( - sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, N); + sg, sub_b, B_ptr + (k * TK) * N + sg_starty / sg_size * TN, + N); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } else { joint_matrix_load(sg, sub_bp, diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp index 6771795c70a0b..e3234da2cd5d9 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp @@ -27,7 +27,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp index d186fdcad03a4..4be21beea9c45 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds_impl.hpp @@ -12,8 +12,7 @@ constexpr size_t TM = 8; constexpr size_t TK = 16; -template -class mult; +template class mult; template >( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp index 56c8af2325ac1..9d9c99bf4ae1a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch_impl.hpp @@ -11,8 +11,7 @@ #define TM 8 #define TK 16 -template -class mult; +template class mult; template @@ -43,7 +42,7 @@ void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty, joint_matrix_prefetch( sg, C + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, C_layout, syclex::properties{syclex::prefetch_hint_L1}); - + joint_matrix_fill(sg, sub_c, 1); for (int k = 0; k < K; k += TK) { joint_matrix_load(sg, sub_a, pA + (sg_startx * TM) * K + k, K); @@ -55,8 +54,8 @@ void joint_matrix_gemm_vnni(sub_group sg, size_t sg_startx, size_t sg_starty, if constexpr (C_layout == layout::col_major) joint_matrix_store(sg, sub_c, - pC + (sg_starty / sg_size * TN) * M + (sg_startx * TM), M, - C_layout); + 
pC + (sg_starty / sg_size * TN) * M + (sg_startx * TM), + M, C_layout); else joint_matrix_store(sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, @@ -73,7 +72,7 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) { q.submit([&](handler &cgh) { cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif @@ -85,8 +84,8 @@ void matrix_multiply(T *C, T1 *A, T2 *B, queue q) { sub_group sg = spmd_item.get_sub_group(); joint_matrix_gemm_vnni(sg, sg_startx, sg_starty, sg_size, A, B, - C); + vnniFactor>(sg, sg_startx, sg_starty, sg_size, + A, B, C); }); // parallel for }).wait(); } From 8dc3756d31095fc476d7dcb2a56e9e45caafdf35 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 09:44:00 -0700 Subject: [PATCH 16/42] clang-format --- sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp index ae0620f3c5459..373ec652cc063 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC_impl.hpp @@ -10,7 +10,6 @@ #include #include - constexpr size_t TM = 8; constexpr size_t TK = 16; @@ -30,7 +29,7 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { q.submit([&](handler &cgh) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif @@ -67,8 +66,8 @@ void matrix_multiply(T1 *C, T2 *A, T2 *B, queue q) { joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( - sg, sub_c, pC + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, - layout::col_major); + sg, sub_c, pC + (sg_startx * TM) * N + sg_starty 
/ sg_size * TN, + N, layout::col_major); }); // parallel for }).wait(); } From 516f6484c0d1409154889179d603575d02bdbd9f Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 12:45:24 -0700 Subject: [PATCH 17/42] Undo changes --- .../Matrix/elemwise_irreg_size_ops_bf16.cpp | 19 ++++++++++--------- sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp index 10c5e195a5344..6a4632305efb3 100644 --- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp +++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp @@ -16,7 +16,10 @@ #include "common.hpp" #include -#include +#include +#include + +#define SG_SZ 16 // 10x12 is not multiply the sg size, slicing implementation will have to insert // padding @@ -42,18 +45,16 @@ void matrix_multiply(big_matrix &C, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; - size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] -#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -74,7 +75,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); for (int k = 0; k < K; k += TK) { joint_matrix_load( @@ -86,7 +87,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template 
get_multi_ptr() + - (k) * (N) + sg_starty / sg_size * TN * 2, + (k) * (N) + sg_starty / SG_SZ * TN * 2, N * 2); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } @@ -94,7 +95,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index feac65bf0e4bf..ad064fd82fc0a 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -9,6 +9,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// XFAIL: cpu #include "common.hpp" From b48d61d81527d393e0218cf911a6e429135be437 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 14 May 2024 14:13:38 -0700 Subject: [PATCH 18/42] SG32 #define SG_SZ --- sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp | 2 +- sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp | 2 +- sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp | 2 +- sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp | 2 +- sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp | 2 +- .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp | 2 +- .../Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp | 2 +- 
.../Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp | 2 +- .../Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp | 2 +- sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp index 182ec8e81233d..4833404610369 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp index 7b9655fe62416..3916aaff03867 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp index e88f0a0a135f5..ddfa39c541c0a 100644 --- 
a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp index 8a91d404f6948..ad644c8734475 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp index 06c1f5d3f5c96..06d459a2a3ce5 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_tf32.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp index 4824ff2568d30..4624110577ea2 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_sizes.cpp @@ -13,6 +13,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp index 3bdd2ed83b08d..9d38fb7afa30d 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp @@ -17,7 +17,7 @@ using namespace sycl; using namespace 
sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp index 79383fce4b7fc..13d8df56f40a1 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp @@ -17,7 +17,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp index 78b95b3ee53d6..5b77ec89fd997 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp @@ -18,7 +18,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp index b9660e73e3ab2..46de02fe8f525 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_all_sizes.cpp @@ -17,7 +17,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 // Sub-matrix N dimension static constexpr size_t SN = 16; diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp index 0ce3d22bc873b..c38d8f133264d 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_apply_bf16.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr 
size_t TN = 16; #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp index 7040058dc8554..b93985f8e594e 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp @@ -19,7 +19,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp index 3a023df7b10f8..10391f2e7e319 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp @@ -16,7 +16,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp index 1b7a8ed351139..994a2217d681f 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -21,7 +21,7 @@ #include "../common.hpp" #include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 8c52421657229..4f7e3638daaf3 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -18,7 +18,7 @@ #include "../common.hpp" 
#include -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp index fc7d0c9e4eba2..2ea58e9953917 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16.cpp @@ -20,7 +20,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::bfloat16; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 06798015261e7..6532bcfe47bff 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -25,7 +25,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::bfloat16; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp index e2158368ff6f8..70e53441cb48f 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_colA_rowB_colC.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp index 52d8bc9c6f4a4..b474f846d11d5 100644 --- 
a/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_down_convert.cpp @@ -13,6 +13,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 #include "../joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp index cb7b15819f2bb..f4dd217655439 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_half.cpp @@ -18,7 +18,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp index d7289579098e9..c89c657c77fbc 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -21,7 +21,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp index 09c4d6059750c..c8ee58e126732 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_int8_vnni.cpp @@ -18,7 +18,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_int8_vnni_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp index ed7fb96ca104a..1848a480a0eb7 100644 --- 
a/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_out_bounds.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; constexpr size_t MATRIX_K = 1024 + 24; diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp index 6b059ed357781..b193d422c2b8c 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_ss_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp index 5a13d4c1f1807..cfd89fcb8a1bf 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_su_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp index 9a82aa8bb647a..18da250bc808d 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_tf32.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp index 504e7beac85e3..214dd10f5158f 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp +++ 
b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp @@ -13,7 +13,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp index 3532e5cc4e3ba..f4b2426af93a8 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_unaligned_k.cpp @@ -15,7 +15,7 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; static constexpr size_t MATRIX_K = 1024 + 14; diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp index a4292269811f1..aec91f70bd1d7 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_us_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp index 842977311cafa..b2d6510622736 100644 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_uu_int8.cpp @@ -16,7 +16,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 32; +#define SG_SZ 32 constexpr size_t TN = 16; #include "../joint_matrix_uu_int8_impl.hpp" From c23311c62be3f852f71f18798fef79d3f4226699 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 14 May 2024 14:19:53 -0700 Subject: [PATCH 19/42] XMX8 no SG_SZ --- sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp | 1 
- sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp | 1 - sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp | 1 - sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp | 1 - sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp | 1 - sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp | 1 - .../test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp | 1 - .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp | 1 - .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp | 1 - sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp | 1 - 29 files changed, 29 deletions(-) diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp index aa2d2e28ac468..d7df42000249a 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include 
"../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp index f360bdbba6ada..826b99dfcf306 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp index 6f3aedfe506d5..a39cb6664d100 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp index ca425f7ded5d1..9ff39c8d516d0 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp index b9d49bba70abb..5bae6a3184808 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp @@ -13,7 +13,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp index 2975ab9edf6c4..87adf891cd16b 100644 --- 
a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp @@ -16,7 +16,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp index 5aa1cd8a2a0d7..d86af51e3cd86 100644 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/XMX8/get_coord_float_matC.cpp @@ -18,7 +18,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp index ece88423d0f43..e815b46e1ed21 100644 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matA.cpp @@ -18,7 +18,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp index a84580c3f846c..4c4d6c6eb5765 100644 --- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp index be1ac0f24e88c..32b8c3bc6e24f 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_all_sizes.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace 
sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t SN = 8; #include "../joint_matrix_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp index f02028d31e7ed..614a67db9ff8a 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_apply_bf16.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp index b52e8085be172..fbcd21be62f75 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp @@ -15,7 +15,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp index 2e05e656e5379..c5e399bc98f48 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp @@ -15,7 +15,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp index 18238e4896ccb..ba24ea0dfc4b8 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -17,7 +17,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git 
a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index 49b5e6eebb4ac..9d88c89c50f41 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -17,7 +17,6 @@ #include "../common.hpp" #include -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp index 008db77761e3d..173ac16a42afc 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp index b72e2ed83841c..5a41f19bc2ac1 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_bfloat16_32x64_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp index e6371806f3592..09c1a4ae32a92 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 static constexpr int TN = 8; #include "../joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp 
b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp index 494a84c173edb..7d74bf8055d6b 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp index dbe060711b02a..419cc936f14e4 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_half.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp index 728a057aedaa7..3dadaeebee511 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_int8_vnni.cpp @@ -12,7 +12,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_int8_vnni_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp index 532af4dc5d844..07a48bd44fccd 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp @@ -8,7 +8,6 @@ #include "../common.hpp" -#define SG_SZ 8 constexpr size_t SN = 8; #include "../joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp index 944cccd310d3e..0ba69032465b9 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp @@ -14,7 +14,6 @@ #include 
"../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; static constexpr size_t MATRIX_K = 1024 + 24; diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp index 4a3770be74f91..fbd97d215498d 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_ss_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp index d5c7a74c20aff..2694d0135c6a1 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_su_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp index 672e8b87e22e6..a0a98e3f16d0c 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp @@ -13,7 +13,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; #include "../joint_matrix_transposeC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp index aa8e00c08b658..f42f37378514d 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp @@ -14,7 +14,6 @@ #include "../common.hpp" -constexpr size_t SG_SZ = 8; constexpr size_t TN = 8; constexpr size_t MATRIX_K = 1024 + 14; diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp 
index 56feaaec924ad..0c5f46f6fcec6 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_us_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp index a1643332e489f..bc08632463f22 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_uu_int8.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 8 constexpr size_t TN = 8; #include "../joint_matrix_uu_int8_impl.hpp" From db8cd7ee26e1abc33bbb0b52623e669caeffc709 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 15 May 2024 07:39:06 -0700 Subject: [PATCH 20/42] WIP abc_impl: remove SG_SZ --- sycl/test-e2e/Matrix/element_wise_abc.cpp | 1 - sycl/test-e2e/Matrix/element_wise_abc_impl.hpp | 16 ++++++++++++---- .../Matrix/element_wise_all_ops_half.cpp | 1 - .../Matrix/element_wise_all_ops_int8.cpp | 1 - .../Matrix/element_wise_all_ops_int8_packed.cpp | 1 - .../Matrix/element_wise_all_ops_tf32.cpp | 1 - sycl/test-e2e/Matrix/element_wise_all_sizes.cpp | 3 --- .../Matrix/elemwise_irreg_size_ops_bf16.cpp | 2 -- sycl/test-e2e/Matrix/get_coord_float_matC.cpp | 1 - sycl/test-e2e/Matrix/get_coord_int8_matA.cpp | 1 - sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp | 1 - .../Matrix/joint_matrix_annotated_ptr.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp | 1 - .../Matrix/joint_matrix_bf16_fill_k_cache.cpp | 1 - .../joint_matrix_bf16_fill_k_cache_init.cpp | 2 +- .../joint_matrix_bf16_fill_k_cache_unroll.cpp | 1 - ...oint_matrix_bf16_fill_k_cache_unroll_init.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp | 1 - 
.../Matrix/joint_matrix_bfloat16_array.cpp | 2 +- ...joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 1 - .../Matrix/joint_matrix_bfloat16_packedB.cpp | 2 -- .../Matrix/joint_matrix_colA_rowB_colC.cpp | 1 - .../Matrix/joint_matrix_down_convert.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_half.cpp | 1 - .../joint_matrix_int8_colmajorA_colmajorB.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp | 1 - .../Matrix/joint_matrix_opt_kernel_feature.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp | 1 - .../Matrix/joint_matrix_rowmajorA_rowmajorB.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_tf32.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp | 1 - .../test-e2e/Matrix/joint_matrix_unaligned_k.cpp | 1 - sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp | 4 ---- 38 files changed, 14 insertions(+), 65 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/element_wise_abc.cpp index c9954fee4f898..0a6a4e4abaa03 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/element_wise_abc.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index bf8b2ecb4df85..8c08bfad7a867 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -15,6 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define TM 8 #define TK 32 +class add; template @@ -27,14 +28,21 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + std::cout << "Artem: before get_sg_size()\n"; + size_t sg_size = 
get_sg_size(q); + std::cout << "Artem: after get_sg_size()\n"; q.submit([&](handler &cgh) { accessor accC{bufC, cgh}; accessor accA{bufA, cgh}; accessor accB{bufB, cgh}; cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -60,14 +68,14 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; }); joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; }); }); // parallel for diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp index fae692ff39ed9..c07d19ed73f2e 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp index 93ddcefc19ac3..e1a2cf4eecfa1 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include 
"element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp index 2d79d945e8980..24f82f47e8fcd 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp index 28483b5c2092e..6e2f8dcff6384 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_tf32.cpp @@ -13,7 +13,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "element_wise_all_ops_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index 661027237f836..1c07e494fcc47 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -12,7 +12,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -#define SG_SZ 16 - #include "element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp index 5d923f4fb908d..7abcf7f69ab4b 100644 --- a/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp +++ b/sycl/test-e2e/Matrix/elemwise_irreg_size_ops_bf16.cpp @@ -22,8 +22,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; using bfloat16 = sycl::ext::oneapi::bfloat16; -#define SG_SZ 16 - // 10x12 is not multiply the sg size, slicing implementation will have to insert // padding #define TM 10 diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp index 78a6f815df19c..57c9a00d98fd4 100644 --- 
a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp @@ -16,7 +16,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp index 6500a34f48119..67fa811f2d764 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp @@ -16,7 +16,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index 9fda659505c43..782df68d21ff5 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "get_coord_int8_matB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp index 408a6087206ea..0eb13cf57347c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 // Sub-matrix N dimension static constexpr size_t SN = 16; diff --git a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp index 265532c140e12..7aad02b2066a1 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_annotated_ptr.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include 
"joint_matrix_annotated_ptr_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp index 82ad15285a4fa..d58677fa2c178 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_apply_bf16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp index 0c93876db2a15..abee7d7259f28 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp @@ -18,7 +18,6 @@ #include "common.hpp" #include -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp index 7206cb165349b..d839f3db8f481 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp @@ -14,7 +14,7 @@ #include "common.hpp" #include -#define SG_SZ 16 + constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp index 5518d9cb08fbc..1800901e24111 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp @@ -20,7 +20,6 @@ #include "common.hpp" #include -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp index a393f3a2ad729..701c17741f576 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp @@ -17,7 +17,6 @@ #include "common.hpp" #include -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp index d1410ac68276e..2222cbb605a15 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp index 80e1f310ce440..98ed155b297ad 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp @@ -11,7 +11,7 @@ // RUN: %{run} %t.out #include "common.hpp" -#define SG_SZ 16 + static constexpr int TN = 16; #include "joint_matrix_bfloat16_array_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 9cd31a8c5178e..19d12915b4a95 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -20,7 +20,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp index 3e80168752545..0d592e04b606c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB.cpp 
@@ -12,6 +12,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -#define SG_SZ 16 #include "joint_matrix_bfloat16_packedB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp index 7d114175dff13..354a71006e129 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_colA_rowB_colC.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "joint_matrix_colA_rowB_colC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp index caea640677aa7..dee504c22e7f6 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert.cpp @@ -11,7 +11,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -constexpr size_t SG_SZ = 16; - #include "joint_matrix_down_convert_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp index ac09361a0799c..9281e47f572d2 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp @@ -17,7 +17,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp index 33c00022a5a76..fb29cc2baaf74 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -20,7 +20,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp 
b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp index 02813c6720deb..8dcddb841721d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp @@ -15,7 +15,6 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_int8_vnni_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp index 6195ee2935892..031c7753de425 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp @@ -16,7 +16,6 @@ #include "common.hpp" -#define SG_SZ 16 static constexpr size_t SN = 16; #include "joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp index a5302b9ee7a57..f3485408373b9 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_out_bounds.cpp @@ -16,7 +16,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; constexpr size_t MATRIX_K = 1024 + 24; diff --git a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp index 30d9278e07157..7abea83c6d287 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_prefetch.cpp @@ -13,6 +13,5 @@ #include "common.hpp" -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_prefetch_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp index 958bd94fe0cd3..77df6085bc09a 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB.cpp @@ -16,8 +16,4 @@ // transform. 
This is currently only available on AMX and XMX of PVC #include "common.hpp" - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - #include "joint_matrix_rowmajorA_rowmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp index e487b8cdcb41d..2089e0185b0e0 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_ss_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp index 72910c4ed5446..7a02d03b9d642 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_su_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp index 6f34a4acbea61..922b79f356e78 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32.cpp @@ -13,10 +13,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_tf32_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp index f98c8bd3c7b48..bd04b157cf667 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp @@ -12,7 +12,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; #include "joint_matrix_transposeC_impl.hpp" diff 
--git a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp index 212ac34a3a640..e1cf6cb6cf8bb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_unaligned_k.cpp @@ -14,7 +14,6 @@ #include "common.hpp" -constexpr size_t SG_SZ = 16; constexpr size_t TN = 16; static constexpr size_t MATRIX_K = 1024 + 14; diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp index 409b589904847..f4237b995aad8 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_us_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp index 59a47484a335c..a75d18b9e6967 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8.cpp @@ -12,10 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define SG_SZ 16 constexpr size_t TN = 16; #include "joint_matrix_uu_int8_impl.hpp" From 5877ed2b40f1082a656eb37253f4852cebb302f8 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 15 May 2024 15:21:26 -0700 Subject: [PATCH 21/42] Made tests independent of SG_SZ --- .../XMX8/joint_matrix_opt_kernel_feature.cpp | 3 - .../test-e2e/Matrix/element_wise_abc_impl.hpp | 8 +- .../Matrix/element_wise_all_sizes_impl.hpp | 54 ++++++----- .../Matrix/get_coord_float_matC_impl.hpp | 15 ++-- .../Matrix/get_coord_int8_matA_impl.hpp | 73 ++++++++------- .../Matrix/joint_matrix_all_sizes_impl.hpp | 57 ++++++------ .../Matrix/joint_matrix_apply_bf16_impl.hpp | 57 ++++++------ .../joint_matrix_bf16_fill_k_cache_impl.hpp | 17 ++-- 
.../joint_matrix_bfloat16_array_impl.hpp | 15 ++-- .../Matrix/joint_matrix_bfloat16_impl.hpp | 15 ++-- .../joint_matrix_bfloat16_packedB_impl.hpp | 15 ++-- .../Matrix/joint_matrix_down_convert_impl.hpp | 15 ++-- .../Matrix/joint_matrix_half_impl.hpp | 90 ++++++++++--------- .../Matrix/joint_matrix_int8_vnni_impl.hpp | 14 +-- .../joint_matrix_opt_kernel_feature.cpp | 3 - .../joint_matrix_opt_kernel_feature_impl.hpp | 18 ++-- .../Matrix/joint_matrix_ss_int8_impl.hpp | 14 +-- .../Matrix/joint_matrix_su_int8_impl.hpp | 16 ++-- .../Matrix/joint_matrix_tf32_impl.hpp | 15 ++-- .../Matrix/joint_matrix_transposeC_impl.hpp | 89 +++++++++--------- .../Matrix/joint_matrix_us_int8_impl.hpp | 14 +-- .../Matrix/joint_matrix_uu_int8_impl.hpp | 16 ++-- 22 files changed, 354 insertions(+), 279 deletions(-) diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp index 07a48bd44fccd..30b3522ad2442 100644 --- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp @@ -7,7 +7,4 @@ // incompatible on the current device #include "../common.hpp" - -constexpr size_t SN = 8; - #include "../joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index 8c08bfad7a867..dea0cf882eaaf 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -15,7 +15,7 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define TM 8 #define TK 32 -class add; +// class add; template @@ -28,15 +28,13 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; - std::cout << "Artem: before get_sg_size()\n"; - size_t sg_size = get_sg_size(q); - std::cout << "Artem: after get_sg_size()\n"; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { 
accessor accC{bufC, cgh}; accessor accA{bufA, cgh}; accessor accB{bufB, cgh}; - cgh.parallel_for( + cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp index 4020e8b84bbd2..5800ab9c62745 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp @@ -23,7 +23,7 @@ void assert_ops_ref(host_accessor C, } } -template +template void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { static constexpr size_t M = TM * M_MULTIPLIER; static constexpr size_t K = 128; @@ -32,7 +32,8 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { size_t NDRangeM = M / TM; size_t NDRangeK = K / TK; queue q; - nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ}); + size_t sg_size = get_sg_size(q); + nd_range<2> r({NDRangeM, NDRangeK * sg_size}, {1, 1 * sg_size}); big_matrix A((T *)&MatA); buffer bufA(A.get_data(), range<2>(M, K)); @@ -40,8 +41,12 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { q.submit([&](handler &cgh) { sycl::accessor accA{bufA, cgh, sycl::read_write}; - cgh.parallel_for( - r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); @@ -57,41 +62,42 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { ext::intel::experimental::matrix::joint_matrix_store( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TM) * K + sg_starty / SG_SZ * TK, + (sg_startx * TM) * K + sg_starty / sg_size * TK, K); }); // parallel for }).wait(); 
assert_ops_ref(bufA.get_host_access(), result); } -template void add_ref() { +template +void add_ref() { if constexpr (std::is_same_v) { // Tests whether 5 + 2 = 7 operation is successful. - matrix_verify_add(bfloat16(5.0), bfloat16(2.0), - bfloat16(7.0)); + matrix_verify_add( + bfloat16(5.0), bfloat16(2.0), bfloat16(7.0)); } if constexpr (std::is_same_v) { - matrix_verify_add(5 /*val1*/, 2 /*val2*/, - 7 /*result*/); + matrix_verify_add(5 /*val1*/, 2 /*val2*/, + 7 /*result*/); } } int main() { - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); + add_ref(); std::cout << "Passed\n"; } diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp index bedc91bdc39d4..b424a01a7c6a6 100644 --- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp @@ -28,13 +28,18 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { buffer sum_rows_v(sum_rows, M); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto v = sum_rows_v.get_access(cgh); - cgh.parallel_for( - nd_range<2>({M / TM, N / TN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({M / TM, N / TN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -49,7 +54,7 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { joint_matrix_load( 
sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); float sum_local_rows[M] = {0}; @@ -62,7 +67,7 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { sum_local_rows[i] = reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); // only Groups leader perform the global reduction - if (global_idy % SG_SZ == 0) { + if (global_idy % sg_size == 0) { sycl::atomic_ref aref(v[i]); diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp index afda0f90a6e37..6f57ab5b4e63c 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp @@ -72,45 +72,54 @@ W0 --> 0 0 1 1 2 2 3 3 .... 7 7 // clang-format on template -void matrix_sum_rows(queue q, big_matrix &A, nd_range<2> &r) { +void matrix_sum_rows(big_matrix &A) { buffer bufA(A.get_data(), range<2>(M, K)); // size of vector is equal to number of rows in big matrix int sum_rows[M] = {0}; buffer sum_rows_v(sum_rows, M); + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({M / TM, K / TK * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { auto accA = bufA.get_access(cgh); auto v = sum_rows_v.get_access(cgh); - cgh.parallel_for(r, [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size( - SG_SZ)]] { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; - joint_matrix_load(sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM * K) + sg_starty / SG_SZ * TK, - K); - - int32_t sum_local_rows[M] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_a, [&](int8_t &x, size_t row, size_t col) { - 
sum_local_rows[row + global_idx * TM] += x; - }); - for (int i = 0; i < M; ++i) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - - // only Groups leader performs the global reduction - if (global_idy % SG_SZ == 0) - atomic_fetch_add(v[i], sum_local_rows[i]); - } - }); // parallel for + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sycl::sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM * K) + sg_starty / sg_size * TK, + K); + + int32_t sum_local_rows[M] = {0}; + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub_a, [&](int8_t &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * TM] += x; + }); + for (int i = 0; i < M; ++i) { + sum_local_rows[i] = + reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); + + // only Groups leader performs the global reduction + if (global_idy % sg_size == 0) + atomic_fetch_add(v[i], sum_local_rows[i]); + } + }); // parallel for }).wait(); sum_rows_ref(bufA.get_host_access(), sum_rows_v.get_host_access()); } @@ -124,8 +133,6 @@ int main() { size_t NDRangeM = MATRIX_M / TM; size_t NDRangeK = MATRIX_K / TK; - queue q; - nd_range<2> r({NDRangeM, NDRangeK * SG_SZ}, {1, 1 * SG_SZ}); for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { @@ -133,7 +140,7 @@ int main() { } } - matrix_sum_rows(q, MA, r); + matrix_sum_rows(MA); std::cout << "Passed\n"; return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp index edfcfe1d2e979..8e9880235c2b2 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp @@ -9,7 +9,7 @@ static constexpr size_t M_MULTIPLIER = 16; template + int vnniFactor, size_t TM, size_t TN, size_t TK, typename kernel_name> void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { size_t NDRangeM = M / TM; @@ -19,15 +19,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { sycl::accessor accC{bufC, cgh, sycl::read_write}; sycl::accessor accA{bufA, cgh, sycl::read_only}; sycl::accessor accB{bufB, cgh, sycl::read_only}; - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -47,7 +50,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -59,21 +62,21 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sg, sub_b, accB.template get_multi_ptr() + (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); } template + size_t 
tK, typename kernel_name> int init_and_multiply() { static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; static constexpr size_t MATRIX_N = 128; @@ -100,7 +103,7 @@ int init_and_multiply() { (Ta *)&Bvnni); matrix_multiply(MC, MA, MBvnni); + tK, kernel_name>(MC, MA, MBvnni); matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K); bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D); @@ -110,23 +113,23 @@ int init_and_multiply() { int main() { int errors = 0; - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); - errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); + errors += init_and_multiply(); return errors; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp index 1ec089d0f53f2..796bdce8d0752 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp @@ -13,35 +13,41 @@ template struct apply_add { void operator()(T &x) const { x = x + bfloat16(2); } }; -template -void 
matrix_verify_add(queue q, big_matrix &A, nd_range<2> &r, - const float ref, F &&lambda) { +template +void matrix_verify_add(big_matrix &A, const float ref, F &&lambda) { buffer bufA(A.get_data(), range<2>(M, N)); + queue q; + size_t sg_size = get_sg_size(q); + nd_range<2> r({M / TM, N / TN * sg_size}, {1, 1 * sg_size}); + q.submit([&](handler &cgh) { accessor accA{bufA, cgh}; - cgh.parallel_for(r, [accA, lambda]( - nd_item<2> spmd_item) [[intel::reqd_sub_group_size( - SG_SZ)]] { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); + cgh.parallel_for( + r, [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a; - joint_matrix_fill(sg, sub_a, bfloat16(5.0)); + joint_matrix_fill(sg, sub_a, bfloat16(5.0)); - joint_matrix_apply(sg, sub_a, lambda); + joint_matrix_apply(sg, sub_a, lambda); - ext::intel::experimental::matrix::joint_matrix_store( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N); - }); // parallel for + ext::intel::experimental::matrix::joint_matrix_store( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N); + }); // parallel for }).wait(); // Check if the results are correct { @@ -61,14 +67,9 @@ int main() { big_matrix MA((bfloat16 *)&A); - size_t NDRangeM = MATRIX_M / TM; - size_t NDRangeN = MATRIX_N / TN; - queue q; - nd_range<2> r({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * 
SG_SZ}); - - matrix_verify_add( - q, MA, r, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); }); - matrix_verify_add(q, MA, r, 7.0, + matrix_verify_add( + MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); }); + matrix_verify_add(MA, 7.0, apply_add()); std::cout << "Passed\n"; return 0; diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 9c12d1053d3bb..6a7b0bb369341 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -68,8 +68,9 @@ static constexpr void manually_unroll_loop(F &&f) { template + typename TResult> double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { + size_t sgSize = get_sg_size(q); range<2> global{rowsA / MCACHE1, (colsB / NCACHE1) * sgSize}; range<2> cachelocal{MCACHE2 / MCACHE1, NCACHE2 / NCACHE1 * sgSize}; @@ -82,12 +83,16 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); - auto mk = q.submit([&](handler &h) { - h.parallel_for( // cache layer#1 + static auto work = [&](handler &h) { + h.parallel_for( // cache layer#1 nd_range<2>{global, cachelocal}, // loop global // loop localrange - [=](nd_item<2> it) [[intel::reqd_sub_group_size(sgSize)]] { + [=](nd_item<2> it) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { auto pA = address_space_cast(A); @@ -290,7 +295,9 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } // m #endif }); // parallel_for - }); // queue.submit + }; // queue.submit + q.submit(work); + if (i == testIterations - 1) q.wait(); std::chrono::duration duration = diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp index 5be3c485312c2..bc317ffc27d31 100644 --- 
a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp @@ -23,14 +23,19 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // Matrix API has to be accessed by all the workitems in a // subgroup. These functions will be called once by the subgroup. // No code divergence between the workitems. @@ -57,7 +62,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, N * 2); for (int i = 0; i < JM_ARRAY_SZ; ++i) { @@ -75,7 +80,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sg, sub_c[i], accC.template get_multi_ptr() + (sg_startx * TM * JM_ARRAY_SZ + TM * i) * N + - sg_starty / SG_SZ * TN, + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp index 8cb6c120d8a34..068506cc63724 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp @@ -19,15 +19,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = 
bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -49,7 +52,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { // joint_matrix_load( @@ -60,14 +63,14 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, N * 2); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp index 91156c3fcc128..36ce0f81f0c63 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp @@ -17,15 +17,18 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, 
{1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -47,7 +50,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { // joint_matrix_load( @@ -59,14 +62,14 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, N * 2); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp index 3f02be1358844..54861eb3b1d3b 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp @@ -23,13 +23,18 @@ void matrix_copy(big_matrix &C, big_matrix &A) { buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), 
+ [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -46,13 +51,13 @@ void matrix_copy(big_matrix &C, big_matrix &A) { joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); joint_matrix_copy(sg, sub_c, sub_a); ext::intel::experimental::matrix::joint_matrix_store( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp index aad8aeaa5c602..53b4ca7b97412 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half_impl.hpp @@ -27,56 +27,60 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - [[intel::reqd_sub_group_size(SG_SZ)]] { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup - // no code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, sg_size}), + 
[=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup + // no code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - // For B, we assume B has been already VNNIed. - joint_matrix - sub_b; - joint_matrix sub_c; + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. + joint_matrix + sub_b; + joint_matrix sub_c; - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TM) * K + k * TK, - K); - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (k * TK / 2) * (N * 2) + sg_starty / SG_SZ * TN * 2, - N * 2); - joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); - } - joint_matrix_store( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, - N, layout::row_major); - }); // parallel for + joint_matrix_load( + sg, sub_c, + accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, + accA.template get_multi_ptr() + + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load( + sg, sub_b, + accB.template get_multi_ptr() + + (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2, + N * 2); + joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); + } + joint_matrix_store( + sg, sub_c, + 
accC.template get_multi_ptr() + + (sg_startx * TM) * N + sg_starty / sg_size * TN, + N, layout::row_major); + }); // parallel for }).wait(); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp index 96993082d8cb5..625b41f3037b8 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni_impl.hpp @@ -26,15 +26,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -61,14 +65,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK) * N + sg_starty / SG_SZ * TN, + (k * TK) * N + sg_starty / sg_size * TN, N); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp index 031c7753de425..5acc54a412096 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp @@ -15,7 +15,4 @@ // 
incompatible on the current device #include "common.hpp" - -static constexpr size_t SN = 16; - #include "joint_matrix_opt_kernel_feature_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp index a0b468120ebd3..7aba5911c8386 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature_impl.hpp @@ -22,14 +22,19 @@ void matrix_multiply(big_matrix &C, big_matrix &A, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { sycl::accessor accC{bufC, cgh, sycl::read_write}; sycl::accessor accA{bufA, cgh, sycl::read_only}; sycl::accessor accB{bufB, cgh, sycl::read_only}; - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { const auto global_idx = spmd_item.get_global_id(0); const auto global_idy = spmd_item.get_global_id(1); const auto sg_startx = global_idx - spmd_item.get_local_id(0); @@ -44,7 +49,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -56,7 +61,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, sg, sub_b, accB.template get_multi_ptr() + (k * TK / vnniFactor) * (N * vnniFactor) + - sg_starty / SG_SZ * TN * vnniFactor, + sg_starty / sg_size * TN * vnniFactor, N * vnniFactor); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } @@ -95,8 +100,9 @@ int main() { init_and_multiply(); // 500 is not correct size } catch 
(const sycl::exception &e) { - if (e.code() == errc::kernel_not_supported) + if (e.code() == errc::invalid) return 0; + throw; } return 1; diff --git a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp index ef67ebbd951f3..3e00c667c2505 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_ss_int8_impl.hpp @@ -28,15 +28,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -64,14 +68,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp index 3973a7b516bc8..f8feb25d99229 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_su_int8_impl.hpp @@ -28,15 +28,19 @@ void 
matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -57,7 +61,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -68,14 +72,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp index 2b2fae59cd94d..536fa84581f27 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp @@ -27,15 +27,18 @@ void matrix_multiply(big_matrix &C, buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { 
auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] - + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif { // The matrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -56,7 +59,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K; k += TK) { joint_matrix_load( @@ -67,7 +70,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k) * (N) + sg_starty / SG_SZ * TN, + (k) * (N) + sg_starty / sg_size * TN, N); // If no rounding to tf32 function is called, joint_matrix_mad // function will work on truncated floats. 
@@ -81,7 +84,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 624cfdb256e7d..faea43b062477 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -11,6 +11,9 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; +template +class LS; + template void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, queue q) { @@ -22,47 +25,51 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - - q.submit([&](handler &cgh) { - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { - auto p_input = - address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_matrix; - - auto row_major_offset = - (sg_startx * TM) * N + (sg_starty / SG_SZ * TN); - auto col_major_offset = - (sg_startx * TM) + (sg_starty / SG_SZ * TN) * M; - - joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, - layout::col_major); - - joint_matrix_store(sg, sub_matrix, - p_out_col_major + row_major_offset, N, - layout::row_major); - - joint_matrix_store(sg, sub_matrix, - p_out_row_major + 
col_major_offset, M, - layout::col_major); - }); // parallel for - }).wait(); + size_t sg_size = get_sg_size>(q); + + static auto work = [&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_matrix; + + auto row_major_offset = + (sg_startx * TM) * N + (sg_starty / sg_size * TN); + auto col_major_offset = + (sg_startx * TM) + (sg_starty / sg_size * TN) * M; + + joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, + layout::col_major); + + joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset, + N, layout::row_major); + + joint_matrix_store(sg, sub_matrix, p_out_row_major + col_major_offset, + M, layout::col_major); + }); // parallel for + }; + q.submit(work).wait(); } template void run_matrix_test() { diff --git a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp index 5441df5fe2542..db8eda82ef239 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_us_int8_impl.hpp @@ -28,16 +28,18 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, 
accB, accC, M, N, K](nd_item<2> spmd_item) + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] - +#endif { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no @@ -59,7 +61,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -71,14 +73,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp index 4dcb60f4330fc..7e7edb700debb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_uu_int8_impl.hpp @@ -28,15 +28,19 @@ void matrix_multiply(big_matrix &C, buffer bufC(C.get_data(), range<2>(M, N)); queue q; + size_t sg_size = get_sg_size(q); q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); auto accA = bufA.get_access(cgh); auto accB = bufB.get_access(cgh); cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, - K](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] { + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + 
[[intel::reqd_sub_group_size(SG_SZ)]] +#endif + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems @@ -57,7 +61,7 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( @@ -69,14 +73,14 @@ void matrix_multiply(big_matrix &C, joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4, + (k * TK / 4) * (N * 4) + sg_starty / sg_size * TN * 4, N * 4); joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); } joint_matrix_store( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / SG_SZ * TN, + (sg_startx * TM) * N + sg_starty / sg_size * TN, N, layout::row_major); }); // parallel for }).wait(); From a5e15a2e5549bb6c9ce271046e867fca6e9a8c51 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 15 May 2024 15:37:33 -0700 Subject: [PATCH 22/42] clang-format --- sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp | 4 ++-- sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp | 3 +-- 9 files changed, 13 insertions(+), 14 deletions(-) diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp index b424a01a7c6a6..32ceaf8c730a0 100644 --- 
a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp @@ -39,7 +39,7 @@ void matrix_sum_rows(big_matrix &C, float *sum_rows) { #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { + { // The submatrix API has to be accessed by all the workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp index 8e9880235c2b2..b48e46e18de3d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes_impl.hpp @@ -102,8 +102,8 @@ int init_and_multiply() { big_matrix MBvnni( (Ta *)&Bvnni); - matrix_multiply(MC, MA, MBvnni); + matrix_multiply(MC, MA, MBvnni); matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K); bool res = matrix_compare(MATRIX_M, MATRIX_N, (Tc *)C, (Tc *)D); diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp index 796bdce8d0752..3d3c6304952e5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_bf16_impl.hpp @@ -69,8 +69,8 @@ int main() { matrix_verify_add( MA, 7.0, [=](bfloat16 &x) { x = x + bfloat16(2); }); - matrix_verify_add(MA, 7.0, - apply_add()); + matrix_verify_add( + MA, 7.0, apply_add()); std::cout << "Passed\n"; return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp index bc317ffc27d31..9aefc370bd0c6 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp @@ -31,11 +31,11 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + 
[=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { + { // Matrix API has to be accessed by all the workitems in a // subgroup. These functions will be called once by the subgroup. // No code divergence between the workitems. diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp index 068506cc63724..aef22d35f7d17 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_impl.hpp @@ -27,7 +27,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp index 36ce0f81f0c63..6a7182c41985d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_packedB_impl.hpp @@ -25,7 +25,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp index 54861eb3b1d3b..8ac48511c7e10 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_down_convert_impl.hpp @@ -30,11 +30,11 @@ void matrix_copy(big_matrix &C, big_matrix &A) { cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { + { // The submatrix API has to be accessed by all the 
workitems in a // subgroup these functions will be called once by the subgroup no // code divergence between the workitems diff --git a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp index 536fa84581f27..69991884c0710 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tf32_impl.hpp @@ -35,7 +35,7 @@ void matrix_multiply(big_matrix &C, cgh.parallel_for( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + [=](nd_item<2> spmd_item) #ifdef SG_SZ [[intel::reqd_sub_group_size(SG_SZ)]] #endif diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index faea43b062477..5de94de6a18ba 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -11,8 +11,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -template -class LS; +template class LS; template void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, From 4cc31ddd4248a5a5558164e4dbc5a4045c075ef6 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Thu, 16 May 2024 08:15:33 -0700 Subject: [PATCH 23/42] Clean up nits --- sycl/test-e2e/Matrix/element_wise_abc_impl.hpp | 1 - sycl/test-e2e/Matrix/get_coord_float_matC.cpp | 4 ---- sycl/test-e2e/Matrix/get_coord_int8_matA.cpp | 4 ---- sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp | 3 --- sycl/test-e2e/Matrix/get_coord_int8_matB.cpp | 4 ---- sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp | 3 --- .../Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_half.cpp | 3 --- .../test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp | 3 --- sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp | 3 --- 11 files changed, 34 deletions(-) diff --git 
a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index dea0cf882eaaf..655fa90275f40 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -15,7 +15,6 @@ using namespace sycl::ext::oneapi::experimental::matrix; #define TM 8 #define TK 32 -// class add; template diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp index 57c9a00d98fd4..af7e8e1745781 100644 --- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp +++ b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp @@ -11,10 +11,6 @@ // RUN: %{run} %t.out #include "common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp index 67fa811f2d764..d29217577443e 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp @@ -11,10 +11,6 @@ // RUN: %{run} %t.out #include "common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp index 6f57ab5b4e63c..3f39ebf731801 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp @@ -131,9 +131,6 @@ int main() { big_matrix MA((int8_t *)&A); - size_t NDRangeM = MATRIX_M / TM; - size_t NDRangeK = MATRIX_K / TK; - for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { A[i][j] = i + j; diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp index 782df68d21ff5..ad064fd82fc0a 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp @@ -12,10 +12,6 @@ 
// XFAIL: cpu #include "common.hpp" -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; constexpr size_t TN = 16; diff --git a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp index 0eb13cf57347c..1478914d1e44f 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_all_sizes.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - // Sub-matrix N dimension static constexpr size_t SN = 16; diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp index 2222cbb605a15..1985bcb6a4fb9 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_bfloat16_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp index 19d12915b4a95..21d5f1239cd8d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp @@ -17,9 +17,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_half.cpp b/sycl/test-e2e/Matrix/joint_matrix_half.cpp index 9281e47f572d2..0bacfa93792d6 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_half.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_half.cpp @@ -14,9 +14,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include 
"joint_matrix_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp index fb29cc2baaf74..37769a41f7003 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_colmajorA_colmajorB.cpp @@ -17,9 +17,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_int8_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp index 8dcddb841721d..f592057ce94d5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_int8_vnni.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - constexpr size_t TN = 16; #include "joint_matrix_int8_vnni_impl.hpp" From 498fa1eba53cda1257d1eb6cb6020236b5d1b7d9 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Thu, 16 May 2024 12:22:22 -0700 Subject: [PATCH 24/42] Managed to remove the static code --- .../joint_matrix_bf16_fill_k_cache_impl.hpp | 5 +- .../Matrix/joint_matrix_transposeC_impl.hpp | 83 ++++++++++--------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp index 6a7b0bb369341..e389ea7137428 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp @@ -83,7 +83,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); - static auto work = [&](handler &h) { + q.submit([&](handler &h) { h.parallel_for( // cache layer#1 
nd_range<2>{global, cachelocal}, // loop global @@ -295,8 +295,7 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) { } // m #endif }); // parallel_for - }; // queue.submit - q.submit(work); + }); // queue.submit if (i == testIterations - 1) q.wait(); diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp index 5de94de6a18ba..24ba24a264f0d 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp @@ -26,49 +26,50 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major, size_t NDRangeN = N / TN; size_t sg_size = get_sg_size>(q); - static auto work = [&](handler &cgh) { - cgh.parallel_for>( - nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) #ifdef SG_SZ - [[intel::reqd_sub_group_size(SG_SZ)]] + [[intel::reqd_sub_group_size(SG_SZ)]] #endif - { - auto p_input = - address_space_cast(input); - - auto p_out_col_major = - address_space_cast(out_col_major); - auto p_out_row_major = - address_space_cast(out_row_major); - - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_matrix; - - auto row_major_offset = - (sg_startx * TM) * N + (sg_starty / sg_size * TN); - auto col_major_offset = - (sg_startx * TM) + (sg_starty / sg_size * TN) * M; - - joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, - layout::col_major); - - joint_matrix_store(sg, sub_matrix, p_out_col_major + row_major_offset, - N, layout::row_major); - - joint_matrix_store(sg, sub_matrix, 
p_out_row_major + col_major_offset, - M, layout::col_major); - }); // parallel for - }; - q.submit(work).wait(); + { + auto p_input = + address_space_cast(input); + + auto p_out_col_major = + address_space_cast(out_col_major); + auto p_out_row_major = + address_space_cast(out_row_major); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_matrix; + + auto row_major_offset = + (sg_startx * TM) * N + (sg_starty / sg_size * TN); + auto col_major_offset = + (sg_startx * TM) + (sg_starty / sg_size * TN) * M; + + joint_matrix_load(sg, sub_matrix, p_input + col_major_offset, M, + layout::col_major); + + joint_matrix_store(sg, sub_matrix, + p_out_col_major + row_major_offset, N, + layout::row_major); + + joint_matrix_store(sg, sub_matrix, + p_out_row_major + col_major_offset, M, + layout::col_major); + }); // parallel for + }).wait(); } template void run_matrix_test() { From 8ab7f803d58ca589f23080fbf3621877764b92d2 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Fri, 17 May 2024 14:27:28 -0700 Subject: [PATCH 25/42] element_wise_abc --- .../test-e2e/Matrix/SG32/element_wise_abc.cpp | 3 +- .../test-e2e/Matrix/XMX8/element_wise_abc.cpp | 17 ----- sycl/test-e2e/Matrix/element_wise_abc.cpp | 5 +- .../test-e2e/Matrix/element_wise_abc_impl.hpp | 64 +++++++++++++------ 4 files changed, 46 insertions(+), 43 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp index 4833404610369..8b4cd57e4b477 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out @@ -14,6 +14,5 @@ #include "../common.hpp" #define SG_SZ 32 -constexpr size_t TN = 16; #include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp deleted file mode 100644 index d7df42000249a..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//==----------- element_wise_abc.cpp - DPC++ joint_matrix------------- ----==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_abc.cpp b/sycl/test-e2e/Matrix/element_wise_abc.cpp index 0a6a4e4abaa03..8a4e277bf6d11 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/element_wise_abc.cpp @@ -5,13 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out #include "common.hpp" - -constexpr size_t TN = 16; - #include "element_wise_abc_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index 655fa90275f40..e61f747f75cc0 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ 
b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -13,27 +13,27 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -#define TM 8 -#define TK 32 +template +class add; template void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, big_matrix &B) { - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; + size_t NDRangeM = 1; + size_t NDRangeN = 1; buffer bufA(A.get_data(), range<2>(M, K)); buffer bufB(B.get_data(), range<2>(K, N)); buffer bufC(C.get_data(), range<2>(M, N)); queue q; - size_t sg_size = get_sg_size(q); + size_t sg_size = get_sg_size>(q); q.submit([&](handler &cgh) { accessor accC{bufC, cgh}; accessor accA{bufA, cgh}; accessor accB{bufB, cgh}; - cgh.parallel_for( + cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ @@ -49,48 +49,72 @@ void matrix_elem_wise_ops(big_matrix &C, big_matrix &A, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a; + joint_matrix sub_a; // For B, we assume B has been already VNNIed. 
- joint_matrix + joint_matrix sub_b; - joint_matrix sub_c; + joint_matrix sub_c; joint_matrix_load( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TM) * K, + (sg_startx * M) * K, K); joint_matrix_apply(sg, sub_a, [](T2 &x) { x += 1; }); joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - sg_starty / sg_size * TN * vnniFactor, + sg_starty / sg_size * N * vnniFactor, N * vnniFactor); joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; }); joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (sg_startx * TM) * N + sg_starty / sg_size * TN, + (sg_startx * M) * N + sg_starty / sg_size * N, N, layout::row_major); joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; }); }); // parallel for }).wait(); } +template +void test() { + Tc A[TM][TK]; + Tc B[TK / VF][TN * VF]; + Ta C[TM][TN]; + + big_matrix MC((Ta *)&C); + big_matrix MA((Tc *)&A); + big_matrix MB((Tc *)&B); + + return matrix_elem_wise_ops(MC, MA, MB); +} + int main() { - static constexpr unsigned vnniFactor = 4; + queue q; + std::vector combinations = + q.get_device() + .get_info(); - int8_t A[TM][TK]; - int8_t B[TK / vnniFactor][TN * vnniFactor]; - int32_t C[TM][TN]; + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } - big_matrix MC((int32_t *)&C); - big_matrix MA((int8_t *)&A); - big_matrix MB((int8_t *)&B); + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } - matrix_elem_wise_ops(MC, MA, MB); + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } return 0; } From 9049d2faaff73596554b3406702d2614b39067cb Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 07:28:43 -0700 Subject: [PATCH 26/42] WIP: element_wise_all_ops_half --- .../Matrix/SG32/element_wise_all_ops_half.cpp | 1 - .../Matrix/XMX8/element_wise_all_ops_half.cpp | 19 ----- .../Matrix/element_wise_all_ops_half.cpp | 3 - 
.../Matrix/element_wise_all_ops_half_impl.hpp | 78 +++++++++++++------ 4 files changed, 55 insertions(+), 46 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp index 3916aaff03867..b04fdff3c0819 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp @@ -16,6 +16,5 @@ #include "../common.hpp" #define SG_SZ 32 -constexpr size_t TN = 16; #include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp deleted file mode 100644 index 826b99dfcf306..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp +++ /dev/null @@ -1,19 +0,0 @@ -//==----------- element_wise_all_ops_half.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-fp16 -// REQUIRES: matrix-xmx8 -// REQUIRES: matrix-fp16 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp index c07d19ed73f2e..e60a6e720cf03 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp @@ -13,7 +13,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -constexpr size_t TN = 16; - #include "element_wise_all_ops_half_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp index c252ed73eb00b..34c79256b4813 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp @@ -5,9 +5,16 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// - -#define TM 8 -#define TK 16 +template +class add; +template +class sub; +template +class mult; +template +class divide; +template +class logic; template void assert_ops_ref(host_accessor C, @@ -60,31 +67,31 @@ void matrix_verify_op(big_matrix &A, const R ref, OP op) { assert_ops_ref(bufA.get_host_access(read_only), ref); } -int main() { - - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - half A[MATRIX_M][MATRIX_N]; - big_matrix MA((half *)&A); +template +void test() { + constexpr size_t MATRIX_M = TM * 2; + constexpr size_t MATRIX_N = TN * 2; + Ta A[MATRIX_M][MATRIX_N]; + big_matrix MA((Ta *)&A); - matrix_verify_op( - MA, 7.0, [=](auto &x) { x = x + static_cast(2); }); - matrix_verify_op( - MA, 
3.0, [=](auto &x) { x = x - static_cast(2); }); - matrix_verify_op( - MA, 15.0, [=](auto &x) { x = x * static_cast(3.0); }); - matrix_verify_op( - MA, 2.5, [=](auto &x) { x = x / static_cast(2.0); }); - matrix_verify_op( + matrix_verify_op, Tc>( + MA, 7.0, [=](auto &x) { x = x + static_cast(2); }); + matrix_verify_op, Tc>( + MA, 3.0, [=](auto &x) { x = x - static_cast(2); }); + matrix_verify_op, Tc>( + MA, 15.0, [=](auto &x) { x = x * static_cast(3.0); }); + matrix_verify_op, Tc>( + MA, 2.5, [=](auto &x) { x = x / static_cast(2.0); }); + matrix_verify_op, Tc>( MA, 7.0, [=](auto &x) { if (x) { - if (x > static_cast(2.0) || x >= static_cast(2.0) || - x < static_cast(2.0) || x <= static_cast(2.0)) { - half val = - (x != static_cast(2.0)) ? x : static_cast(2.0); + if (x > static_cast(2.0) || x >= static_cast(2.0) || + x < static_cast(2.0) || x <= static_cast(2.0)) { + Ta val = + (x != static_cast(2.0)) ? x : static_cast(2.0); val--; val++; - if (x == static_cast(2.0)) { + if (x == static_cast(2.0)) { val -= 2; val *= 3; val /= 2; @@ -95,6 +102,31 @@ int main() { } } }); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test(); + break; + } + } return 0; } From 6eae2dacd1a903d8d720dcb89001710d0d327dbd Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 21 May 2024 12:36:37 -0700 Subject: [PATCH 27/42] Expanded tests that use combos --- sycl/test-e2e/Matrix/element_wise_ops_impl.hpp | 1 + sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp | 4 ++++ .../test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp | 2 ++ 3 files changed, 7 insertions(+) diff --git a/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp 
b/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp index edde026ed877e..8ffbbf8eabff1 100644 --- a/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_ops_impl.hpp @@ -132,6 +132,7 @@ int main() { // These combination are not currently supported for subgroup size = 32 in // IGC passed &= test(); + passed &= test(); passed &= test(); #endif break; diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp index 1e8f58f3dc55d..9751571bcbcf5 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_apply_two_matrices_impl.hpp @@ -139,6 +139,10 @@ int main() { if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc passed &= test(); passed &= test(); +// This combination is not currently supported for sub group size = 32 in IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + passed &= test(); +#endif break; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp index 44b09042b998f..754d49c354d01 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_rowmajorA_rowmajorB_impl.hpp @@ -123,6 +123,8 @@ int main() { int32_t>(); if (combination.nsize == 16) { // architecture::intel_gpu_pvc + res += gemm_row_major<16, 16, 16, class bf16_16x16x16, bfloat16, + bfloat16, float>(); res += gemm_row_major<1, 64, 16, class bf16_1x64x16, bfloat16, bfloat16, float>(); res += gemm_row_major<32, 64, 16, class bf16_32x64x16, bfloat16, From 2a6b455df9fbc1c90620e9d98c2eab5397510f7c Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 22 May 2024 09:18:31 -0700 Subject: [PATCH 28/42] PassedL element_wise_all_ops_int8_packed --- .../SG32/element_wise_all_ops_int8_packed.cpp | 1 - .../XMX8/element_wise_all_ops_int8_packed.cpp | 19 ---- 
.../element_wise_all_ops_int8_packed.cpp | 3 - .../element_wise_all_ops_int8_packed_impl.hpp | 94 +++++++++++++------ 4 files changed, 63 insertions(+), 54 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp index ad644c8734475..2d62023ad7d01 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -16,6 +16,5 @@ #include "../common.hpp" #define SG_SZ 32 -constexpr size_t TN = 16; #include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp deleted file mode 100644 index 9ff39c8d516d0..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp +++ /dev/null @@ -1,19 +0,0 @@ -//==------ element_wise_all_ops_int8_packed.cpp - DPC++ joint_matrix-------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// This test stores the matrix B that is VNNIed (packed). 
- -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp index 24f82f47e8fcd..43370673c75f7 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp @@ -13,7 +13,4 @@ // This test stores the matrix B that is VNNIed (packed). #include "common.hpp" - -constexpr size_t TN = 16; - #include "element_wise_all_ops_int8_packed_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp index 063ffe9717f83..953531927ce83 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp @@ -6,8 +6,11 @@ // //===----------------------------------------------------------------------===// -#define TM 8 -#define TK 32 +template class add; +template class sub; +template class mul; +template class divide; +template class logic; template void assert_ops_ref(host_accessor C, const R ref) { @@ -61,40 +64,69 @@ void matrix_verify_op(big_matrix &B, const R ref, OP op) { assert_ops_ref(bufB.get_host_access(read_only), ref); } -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -int8_t B[MATRIX_M][MATRIX_N]; - -int main() { +template void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + int8_t B[MATRIX_M][MATRIX_N]; big_matrix MB((int8_t *)&B); - matrix_verify_op( - MB, 7, [=](auto &x) { x = x + 2; }); - matrix_verify_op( - MB, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op( - MB, 10, [=](auto &x) { x = x * 2; }); - matrix_verify_op( + matrix_verify_op, int>(MB, 7, + [=](auto &x) { x = x + 2; }); + matrix_verify_op, int>(MB, 3, + [=](auto &x) { x = x - 2; }); + 
matrix_verify_op, int>(MB, 10, + [=](auto &x) { x = x * 2; }); + matrix_verify_op, int>( MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op( - MB, 7, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - int8_t val = (x != 2) ? x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; - } - x = val; - } + matrix_verify_op, int>(MB, 7, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + int8_t val = (x != 2) ? x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; } - }); + x = val; + } + } + }); +} + +int main() { + + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test<16, 16, 64>(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test<8, 16, 32>(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test<8, 8, 32>(); + break; + } + } return 0; } From c88783b5e1727c936c30320aa57989bd83a68ff6 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 22 May 2024 09:26:42 -0700 Subject: [PATCH 29/42] Pass: element_wise_all_ops_int8 --- .../Matrix/SG32/element_wise_all_ops_int8.cpp | 1 - .../Matrix/XMX8/element_wise_all_ops_int8.cpp | 17 ---- .../Matrix/element_wise_all_ops_int8.cpp | 3 - .../Matrix/element_wise_all_ops_int8_impl.hpp | 92 ++++++++++++------- .../element_wise_all_ops_int8_packed_impl.hpp | 26 +++--- 5 files changed, 71 insertions(+), 68 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp index ddfa39c541c0a..fbc965df97a46 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp +++ 
b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp @@ -14,6 +14,5 @@ #include "../common.hpp" #define SG_SZ 32 -constexpr size_t TN = 16; #include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp deleted file mode 100644 index a39cb6664d100..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp +++ /dev/null @@ -1,17 +0,0 @@ -//==----------- element_wise_all_ops_int8.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp index e1a2cf4eecfa1..11b488a8298ca 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp @@ -11,7 +11,4 @@ // RUN: %{run} %t.out #include "common.hpp" - -constexpr size_t TN = 16; - #include "element_wise_all_ops_int8_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp index c5025543bfd78..b60d24f00f769 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp @@ -6,8 +6,11 @@ // //===----------------------------------------------------------------------===// -#define TM 8 -#define TK 32 +template class add; +template class sub; +template class mul; +template class divide; +template class logic; template void 
assert_ops_ref(host_accessor C, const R ref) { @@ -59,40 +62,65 @@ void matrix_verify_op(big_matrix &A, const R ref, OP op) { assert_ops_ref(bufA.get_host_access(read_only), ref); } -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -int8_t A[MATRIX_M][MATRIX_N]; - -int main() { +template void test() { + static constexpr size_t MATRIX_M = TM * 2; + static constexpr size_t MATRIX_N = TN * 2; + int8_t A[MATRIX_M][MATRIX_N]; big_matrix MA((int8_t *)&A); - matrix_verify_op( - MA, 7, [=](auto &x) { x = x + 2; }); - matrix_verify_op( - MA, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op( - MA, 10, [=](auto &x) { x = x * 2; }); - matrix_verify_op( - MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op( - MA, 7, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - int8_t val = (x != 2) ? x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; - } - x = val; - } + matrix_verify_op, + int>(MA, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, + int>(MA, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, + int>(MA, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, + int>(MA, 2, + [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, + int>(MA, 7, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + int8_t val = (x != 2) ? 
x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; } - }); + x = val; + } + } + }); +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test<16, 16, 64>(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test<8, 16, 32>(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test<8, 8, 32>(); + break; + } + } return 0; } diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp index 953531927ce83..a3ed8d73bace2 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp @@ -71,20 +71,17 @@ template void test() { big_matrix MB((int8_t *)&B); - matrix_verify_op, int>(MB, 7, - [=](auto &x) { x = x + 2; }); - matrix_verify_op, int>(MB, 3, - [=](auto &x) { x = x - 2; }); - matrix_verify_op, int>(MB, 10, - [=](auto &x) { x = x * 2; }); - matrix_verify_op, int>( - MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op, int>(MB, 7, [=](auto &x) { + matrix_verify_op, + int>(MB, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, + int>(MB, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, + int>(MB, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, + int>(MB, 2, + [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, + int>(MB, 7, [=](auto &x) { if (x) { if (x > 2 || x >= 2 || x < 2 || x <= 2) { int8_t val = (x != 2) ? 
x : 2; @@ -104,7 +101,6 @@ template void test() { } int main() { - queue q; std::vector combinations = q.get_device() From 317e3c20bef0b912539935a26de511979f51f840 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 22 May 2024 10:16:03 -0700 Subject: [PATCH 30/42] Pass: element_wise_all_sizes --- .../Matrix/SG32/element_wise_all_ops_int8.cpp | 2 +- .../Matrix/XMX8/element_wise_all_sizes.cpp | 18 ----- .../XMX8/element_wise_all_sizes_no_split.cpp | 3 - .../Matrix/element_wise_all_sizes.cpp | 2 +- .../Matrix/element_wise_all_sizes_impl.hpp | 65 +++++++++++++------ 5 files changed, 48 insertions(+), 42 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp index fbc965df97a46..4f71059c759b7 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp deleted file mode 100644 index 5bae6a3184808..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//==----------- element_wise_all_sizes.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix-xmx8 -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -constexpr size_t TN = 8; - -#include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp index 87adf891cd16b..3de741d8be76f 100644 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp +++ b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp @@ -15,7 +15,4 @@ // RUN: %{run} %t.out #include "../common.hpp" - -constexpr size_t TN = 8; - #include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index 1c07e494fcc47..ef13dcd6c640c 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp index 5800ab9c62745..e324f1fcd30af 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// static constexpr size_t M_MULTIPLIER = 16; +template class add; template void assert_ops_ref(host_accessor C, @@ -23,7 +24,7 @@ void assert_ops_ref(host_accessor C, } } -template +template void matrix_verify_add(const T1 
val1, const T1 val2, const T1 result) { static constexpr size_t M = TM * M_MULTIPLIER; static constexpr size_t K = 128; @@ -69,35 +70,61 @@ void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { assert_ops_ref(bufA.get_host_access(), result); } -template +template void add_ref() { if constexpr (std::is_same_v) { // Tests whether 5 + 2 = 7 operation is successful. - matrix_verify_add( + matrix_verify_add( bfloat16(5.0), bfloat16(2.0), bfloat16(7.0)); } if constexpr (std::is_same_v) { - matrix_verify_add(5 /*val1*/, 2 /*val2*/, + matrix_verify_add(5 /*val1*/, 2 /*val2*/, 7 /*result*/); } } +template +void test() { + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); + add_ref>(); +} + int main() { - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); - add_ref(); + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test<16>(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test<16>(); + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test<8>(); + break; + } + } std::cout << "Passed\n"; } From 7d51e9cd79d6c0c9a8d6a2eca0fd93ebafdd90b1 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 22 May 2024 10:28:44 -0700 Subject: [PATCH 31/42] clang-format and nits --- .../Matrix/SG32/element_wise_all_ops_half.cpp | 3 +-- .../SG32/element_wise_all_ops_int8_packed.cpp | 2 +- sycl/test-e2e/Matrix/element_wise_abc_impl.hpp | 3 +-- .../Matrix/element_wise_all_ops_half.cpp | 3 +-- .../Matrix/element_wise_all_ops_half_impl.hpp | 18 ++++++------------ .../Matrix/element_wise_all_ops_int8.cpp | 2 +- 
.../element_wise_all_ops_int8_packed.cpp | 2 +- .../Matrix/element_wise_all_sizes_impl.hpp | 10 +++++----- 8 files changed, 17 insertions(+), 26 deletions(-) diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp index b04fdff3c0819..99473c896628b 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-fp16 -// REQUIRES: matrix,gpu -// REQUIRES: matrix-fp16 +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp index 2d62023ad7d01..5f9f2809bf3ff 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp index e61f747f75cc0..5caf6d3e0a3e7 100644 --- a/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_abc_impl.hpp @@ -13,8 +13,7 @@ using namespace sycl; using namespace sycl::ext::oneapi::experimental::matrix; -template -class add; +template class add; template diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp index e60a6e720cf03..f97241f275bd1 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp +++ 
b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-fp16 -// REQUIRES: matrix -// REQUIRES: matrix-fp16 +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp index 34c79256b4813..04e3d516a491a 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp @@ -5,16 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -template -class add; -template -class sub; -template -class mult; -template -class divide; -template -class logic; +template class add; +template class sub; +template class mult; +template class divide; +template class logic; template void assert_ops_ref(host_accessor C, @@ -87,8 +82,7 @@ void test() { if (x) { if (x > static_cast(2.0) || x >= static_cast(2.0) || x < static_cast(2.0) || x <= static_cast(2.0)) { - Ta val = - (x != static_cast(2.0)) ? x : static_cast(2.0); + Ta val = (x != static_cast(2.0)) ? 
x : static_cast(2.0); val--; val++; if (x == static_cast(2.0)) { diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp index 11b488a8298ca..ba538e4ebffef 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp index 43370673c75f7..be27718279b79 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix +// REQUIRES: aspect-ext_intel_matrix // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp index e324f1fcd30af..5228a154e9f6f 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes_impl.hpp @@ -24,7 +24,8 @@ void assert_ops_ref(host_accessor C, } } -template +template void matrix_verify_add(const T1 val1, const T1 val2, const T1 result) { static constexpr size_t M = TM * M_MULTIPLIER; static constexpr size_t K = 128; @@ -78,13 +79,12 @@ void add_ref() { bfloat16(5.0), bfloat16(2.0), bfloat16(7.0)); } if constexpr (std::is_same_v) { - matrix_verify_add(5 /*val1*/, 2 /*val2*/, - 7 /*result*/); + matrix_verify_add( + 5 /*val1*/, 2 /*val2*/, 7 /*result*/); } } -template -void test() { +template void test() { add_ref>(); 
add_ref>(); add_ref>(); From df1be4f5b874260a9557b4a4890632fac27cae16 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Wed, 22 May 2024 14:46:02 -0700 Subject: [PATCH 32/42] fixed requires --- sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp | 2 ++ sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp | 4 +++- sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp | 2 ++ .../test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp | 2 ++ 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp index 8b4cd57e4b477..1fdd989ae091f 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp @@ -5,6 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 // REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp index 99473c896628b..847f4a7812aa2 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp @@ -5,8 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 // REQUIRES: aspect-fp16 -// REQUIRES: aspect-ext_intel_matrix +// REQUIRES: aspect-ext_intel_matrix, gpu // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp index 
4f71059c759b7..984ff9a9b082f 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp @@ -5,6 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 // REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 diff --git a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp index 5f9f2809bf3ff..af2f4df82b648 100644 --- a/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp +++ b/sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8_packed.cpp @@ -5,6 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// UNSUPPORTED: gpu-intel-dg2 // REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 From ced84eb054a30c49ff4bf1b43fffc782b4911059 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Thu, 23 May 2024 12:54:44 -0700 Subject: [PATCH 33/42] fixed element_wise_all_ops_int8 --- .../Matrix/element_wise_all_ops_int8_impl.hpp | 109 +++++++++--------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp index b60d24f00f769..8c39c2f132d0a 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp @@ -6,33 +6,33 @@ // //===----------------------------------------------------------------------===// -template class add; -template class sub; -template class mul; -template class divide; 
-template class logic; +template class add; +template class sub; +template class mul; +template class divide; +template class logic; -template +template void assert_ops_ref(host_accessor C, const R ref) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { + for (size_t i = 0; i < Rows; i++) + for (size_t j = 0; j < Cols; j++) { auto diff = C[i][j] - ref; assert(std::fabs(static_cast(diff)) <= std::numeric_limits::epsilon()); } } -template -void matrix_verify_op(big_matrix &A, const R ref, OP op) { - buffer bufA(A.get_data(), range<2>(M, N)); +template +void matrix_verify_op(big_matrix &A, const R ref, OP op) { + buffer bufA(A.get_data(), range<2>(Rows, Cols)); queue q; size_t sg_size = get_sg_size(q); - nd_range<2> r({M / TileM, N / TileN * sg_size}, {1, 1 * sg_size}); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { - auto accA = bufA.get_access(cgh); + sycl::accessor accA{bufA, cgh, sycl::read_write}; cgh.parallel_for( r, [=](nd_item<2> spmd_item) @@ -46,7 +46,8 @@ void matrix_verify_op(big_matrix &A, const R ref, OP op) { const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix + joint_matrix sub_a; joint_matrix_fill(sg, sub_a, 5); @@ -55,47 +56,47 @@ void matrix_verify_op(big_matrix &A, const R ref, OP op) { ext::intel::experimental::matrix::joint_matrix_store( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TileM) * N + sg_starty / sg_size * TileN, - N); + (sg_startx * TileRows) * Cols + + sg_starty / sg_size * TileCols, + Cols); }); // parallel for }).wait(); - assert_ops_ref(bufA.get_host_access(read_only), ref); + assert_ops_ref(bufA.get_host_access(read_only), ref); } -template void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - int8_t A[MATRIX_M][MATRIX_N]; - - big_matrix MA((int8_t *)&A); - - matrix_verify_op, - int>(MA, 7, [=](auto &x) { x = x + 2; }); 
- matrix_verify_op, - int>(MA, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op, - int>(MA, 10, [=](auto &x) { x = x * 2; }); - matrix_verify_op, - int>(MA, 2, - [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op, - int>(MA, 7, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - int8_t val = (x != 2) ? x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; +template void test() { + static constexpr size_t Rows = TM * 2; + static constexpr size_t Cols = TK * 2; + Ta A[Rows][Cols]; + + big_matrix MA((Ta *)&A); + + matrix_verify_op, TResult>( + MA, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, TResult>( + MA, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, TResult>( + MA, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, TResult>( + MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, TResult>( + MA, 7, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + Ta val = (x != 2) ? 
x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; + } + x = val; + } } - x = val; - } - } - }); + }); } int main() { @@ -107,17 +108,17 @@ int main() { for (unsigned int i = 0; i < combinations.size(); i++) { if (combinations[i].nsize == 0) { // Intel AMX - test<16, 16, 64>(); + test(); break; } if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test<8, 16, 32>(); + test(); break; } if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test<8, 8, 32>(); + test(); break; } } From 8b0e59debf7835de7ed26ae06419edd90a57f938 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Mon, 27 May 2024 10:34:42 -0700 Subject: [PATCH 34/42] CPU works element_wise_all_ops_half --- .../Matrix/element_wise_all_ops_half.cpp | 1 + .../Matrix/element_wise_all_ops_half_impl.hpp | 94 +++++++++---------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp index f97241f275bd1..148f54e44bedc 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// REQUIRES: matrix-fp16 // REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp index 04e3d516a491a..0cfedc62ff425 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp @@ -5,34 +5,33 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -template class add; -template class sub; -template class mult; -template class divide; -template 
class logic; - -template -void assert_ops_ref(host_accessor C, - const float ref) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { +template class add; +template class sub; +template class mul; +template class divide; +template class logic; + +template +void assert_ops_ref(host_accessor C, const R ref) { + for (size_t i = 0; i < Rows; i++) + for (size_t j = 0; j < Cols; j++) { auto diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) < + assert(std::fabs(static_cast(diff)) <= std::numeric_limits::epsilon()); } } -template -void matrix_verify_op(big_matrix &A, const R ref, OP op) { - buffer bufA(A.get_data(), range<2>(M, N)); +template +void matrix_verify_op(big_matrix &A, const R ref, OP op) { + buffer bufA(A.get_data(), range<2>(Rows, Cols)); queue q; size_t sg_size = get_sg_size(q); - nd_range<2> r({M / TileM, N / TileN * sg_size}, {1, 1 * sg_size}); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { - auto accA = bufA.get_access(cgh); + sycl::accessor accA{bufA, cgh, sycl::read_write}; cgh.parallel_for( r, [=](nd_item<2> spmd_item) @@ -46,7 +45,8 @@ void matrix_verify_op(big_matrix &A, const R ref, OP op) { const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix + joint_matrix sub_a; joint_matrix_fill(sg, sub_a, 5); @@ -55,37 +55,37 @@ void matrix_verify_op(big_matrix &A, const R ref, OP op) { ext::intel::experimental::matrix::joint_matrix_store( sg, sub_a, accA.template get_multi_ptr() + - (sg_startx * TileM) * N + sg_starty / sg_size * TileN, - N); + (sg_startx * TileRows) * Cols + + sg_starty / sg_size * TileCols, + Cols); }); // parallel for }).wait(); - assert_ops_ref(bufA.get_host_access(read_only), ref); + assert_ops_ref(bufA.get_host_access(read_only), ref); } -template -void test() { - constexpr size_t MATRIX_M = TM * 2; - constexpr size_t MATRIX_N = TN * 2; - Ta A[MATRIX_M][MATRIX_N]; - big_matrix MA((Ta 
*)&A); - - matrix_verify_op, Tc>( - MA, 7.0, [=](auto &x) { x = x + static_cast(2); }); - matrix_verify_op, Tc>( - MA, 3.0, [=](auto &x) { x = x - static_cast(2); }); - matrix_verify_op, Tc>( - MA, 15.0, [=](auto &x) { x = x * static_cast(3.0); }); - matrix_verify_op, Tc>( - MA, 2.5, [=](auto &x) { x = x / static_cast(2.0); }); - matrix_verify_op, Tc>( - MA, 7.0, [=](auto &x) { +template void test() { + static constexpr size_t Rows = TM * 2; + static constexpr size_t Cols = TK * 2; + Ta A[Rows][Cols]; + + big_matrix MA((Ta *)&A); + + matrix_verify_op, TResult>( + MA, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, TResult>( + MA, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, TResult>( + MA, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, TResult>( + MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, TResult>( + MA, 7, [=](auto &x) { if (x) { - if (x > static_cast(2.0) || x >= static_cast(2.0) || - x < static_cast(2.0) || x <= static_cast(2.0)) { - Ta val = (x != static_cast(2.0)) ? x : static_cast(2.0); + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + Ta val = (x != 2) ? 
x : 2; val--; val++; - if (x == static_cast(2.0)) { + if (x == 2) { val -= 2; val *= 3; val /= 2; @@ -107,20 +107,20 @@ int main() { for (unsigned int i = 0; i < combinations.size(); i++) { if (combinations[i].nsize == 0) { // Intel AMX - test(); + test(); break; } if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); + test(); break; } if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); + test(); break; } } return 0; -} +} \ No newline at end of file From 925c241fbb2c309c500a2a3b40b1e0710a8aca00 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Mon, 27 May 2024 13:54:55 -0700 Subject: [PATCH 35/42] CPU pass element_wise_all_ops_int8_packed --- .../element_wise_all_ops_int8_packed_impl.hpp | 115 +++++++++--------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp index a3ed8d73bace2..c4a058db808de 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp @@ -6,33 +6,34 @@ // //===----------------------------------------------------------------------===// -template class add; -template class sub; -template class mul; -template class divide; -template class logic; +template class add; +template class sub; +template class mul; +template class divide; +template class logic; -template +template void assert_ops_ref(host_accessor C, const R ref) { - for (size_t i = 0; i < M; i++) - for (size_t j = 0; j < N; j++) { - auto diff = C[i][j] - ref; + for (size_t i = 0; i < Rows; i++) + for (size_t j = 0; j < Cols; j++) { + R diff = C[i][j] - ref; assert(std::fabs(static_cast(diff)) <= std::numeric_limits::epsilon()); } } -template -void matrix_verify_op(big_matrix &B, const R ref, OP op) { - buffer bufB(B.get_data(), range<2>(M, N)); +template +void matrix_verify_op(big_matrix &B, const R ref, OP op) { 
+ buffer bufB(B.get_data(), range<2>(Rows, Cols)); queue q; size_t sg_size = get_sg_size(q); - nd_range<2> r({M / TileM, N / TileN * sg_size}, {1, 1 * sg_size}); + nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { - auto accB = bufB.get_access(cgh); + sycl::accessor accB{bufB, cgh, sycl::read_write}; cgh.parallel_for( r, [=](nd_item<2> spmd_item) @@ -46,7 +47,7 @@ void matrix_verify_op(big_matrix &B, const R ref, OP op) { const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_b; @@ -56,48 +57,48 @@ void matrix_verify_op(big_matrix &B, const R ref, OP op) { ext::intel::experimental::matrix::joint_matrix_store( sg, sub_b, accB.template get_multi_ptr() + - (sg_startx * TileM) * N * 4 + - sg_starty / sg_size * TileN * 4, - N * 4); + (sg_startx * TileRows / VNNI) * Cols * VNNI + + sg_starty / sg_size * TileCols * VNNI, + Cols * VNNI); }); // parallel for }).wait(); - assert_ops_ref(bufB.get_host_access(read_only), ref); + assert_ops_ref(bufB.get_host_access(read_only), ref); } -template void test() { - static constexpr size_t MATRIX_M = TM * 2; - static constexpr size_t MATRIX_N = TN * 2; - int8_t B[MATRIX_M][MATRIX_N]; - - big_matrix MB((int8_t *)&B); - - matrix_verify_op, - int>(MB, 7, [=](auto &x) { x = x + 2; }); - matrix_verify_op, - int>(MB, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op, - int>(MB, 10, [=](auto &x) { x = x * 2; }); - matrix_verify_op, - int>(MB, 2, - [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op, - int>(MB, 7, [=](auto &x) { - if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - int8_t val = (x != 2) ? 
x : 2; - val--; - val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; - } else { - val += 2; +template +void test() { + static constexpr size_t Rows = TK * 2; + static constexpr size_t Cols = TN * 2; + Ta B[Rows][Cols]; + + big_matrix MB((Ta *)&B); + + matrix_verify_op, TResult>( + MB, 7, [=](auto &x) { x = x + 2; }); + matrix_verify_op, TResult>( + MB, 3, [=](auto &x) { x = x - 2; }); + matrix_verify_op, TResult>( + MB, 10, [=](auto &x) { x = x * 2; }); + matrix_verify_op, TResult>( + MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected + matrix_verify_op, TResult>( + MB, 7, [=](auto &x) { + if (x) { + if (x > 2 || x >= 2 || x < 2 || x <= 2) { + Ta val = (x != 2) ? x : 2; + val--; + val++; + if (x == 2) { + val -= 2; + val *= 3; + val /= 2; + } else { + val += 2; + } + x = val; + } } - x = val; - } - } - }); + }); } int main() { @@ -108,18 +109,18 @@ int main() { matrix_combinations>(); for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test<16, 16, 64>(); + if (combinations[i].nsize == 0) { // Intel AMX + test(); // should work break; } if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test<8, 16, 32>(); + test(); break; } if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test<8, 8, 32>(); + test(); break; } } From 894d5f8a1e8bead6b862078a3b5afa455a3aafa3 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Mon, 27 May 2024 13:58:53 -0700 Subject: [PATCH 36/42] removed XMX8/element_wise_all_sizes_no_split --- .../XMX8/element_wise_all_sizes_no_split.cpp | 18 ------------------ .../test-e2e/Matrix/element_wise_all_sizes.cpp | 3 +++ 2 files changed, 3 insertions(+), 18 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp diff --git a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp b/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp deleted file mode 100644 index 
3de741d8be76f..0000000000000 --- a/sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//==-------- element_wise_all_sizes_no_split.cpp - DPC++ joint_matrix------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This is a version of element_wise_all_sizes test with disabled device code -// split to test against fixed bug in IGC - -// REQUIRES: matrix-xmx8 -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -fsycl-device-code-split=off -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" -#include "../element_wise_all_sizes_impl.hpp" diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index ef13dcd6c640c..7999904ba7659 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -11,5 +11,8 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// RUN: %{build} -fsycl-device-code-split=off -o %t_split.out +// RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} + #include "common.hpp" #include "element_wise_all_sizes_impl.hpp" From 0a6a182bd67703827fd413243a16bcf47aa18c1b Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Mon, 27 May 2024 14:40:01 -0700 Subject: [PATCH 37/42] clang-format --- sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp index 8c39c2f132d0a..8a2f1f495e41d 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_impl.hpp @@ -32,7 +32,7 @@ void 
matrix_verify_op(big_matrix &A, const R ref, OP op) { nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; + sycl::accessor accA{bufA, cgh, sycl::read_write}; cgh.parallel_for( r, [=](nd_item<2> spmd_item) From 72f021e36063a0933969ea7eef6ab8175f40caa0 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 28 May 2024 08:03:19 -0700 Subject: [PATCH 38/42] Typo in /element_wise_all_ops_half --- .../Matrix/element_wise_all_ops_half_impl.hpp | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp index 0cfedc62ff425..407433c0d3032 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp @@ -71,26 +71,27 @@ template void test() { big_matrix MA((Ta *)&A); matrix_verify_op, TResult>( - MA, 7, [=](auto &x) { x = x + 2; }); + MA, 7, [=](Ta &x) { x = x + static_cast(2); }); matrix_verify_op, TResult>( - MA, 3, [=](auto &x) { x = x - 2; }); + MA, 3, [=](Ta &x) { x = x - static_cast(2); }); matrix_verify_op, TResult>( - MA, 10, [=](auto &x) { x = x * 2; }); + MA, 10, [=](Ta &x) { x = x * static_cast(2); }); matrix_verify_op, TResult>( - MA, 2, [=](auto &x) { x = x / 2; }); // truncation is expected + MA, 2.5, [=](Ta &x) { x = x / static_cast(2); }); matrix_verify_op, TResult>( - MA, 7, [=](auto &x) { + MA, 7, [=](Ta &x) { if (x) { - if (x > 2 || x >= 2 || x < 2 || x <= 2) { - Ta val = (x != 2) ? x : 2; + if (x > static_cast(2) || x >= static_cast(2) || + x < static_cast(2) || x <= static_cast(2)) { + Ta val = (x != static_cast(2)) ? 
x : static_cast(2); val--; val++; - if (x == 2) { - val -= 2; - val *= 3; - val /= 2; + if (x == static_cast(2)) { + val -= static_cast(2); + val *= static_cast(3); + val /= static_cast(2); } else { - val += 2; + val += static_cast(2); } x = val; } @@ -107,17 +108,17 @@ int main() { for (unsigned int i = 0; i < combinations.size(); i++) { if (combinations[i].nsize == 0) { // Intel AMX - test(); + test(); break; } if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); + test(); break; } if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); + test(); break; } } From 587137c67297ff3c3d065fbc7d5069c20afac770 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 28 May 2024 14:51:42 -0700 Subject: [PATCH 39/42] nits --- .../Matrix/element_wise_all_ops_half.cpp | 2 +- .../Matrix/element_wise_all_ops_half_impl.hpp | 4 +- .../element_wise_all_ops_int8_packed_impl.hpp | 37 ++++++++++--------- .../Matrix/element_wise_all_sizes.cpp | 1 + 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp index 148f54e44bedc..bb651568f9251 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: matrix-fp16 // REQUIRES: aspect-fp16 // REQUIRES: aspect-ext_intel_matrix +// REQUIRES: matrix-fp16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp index 407433c0d3032..4065c7a78a566 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_half_impl.hpp @@ -16,7 +16,7 @@ void assert_ops_ref(host_accessor C, const R ref) 
{ for (size_t i = 0; i < Rows; i++) for (size_t j = 0; j < Cols; j++) { auto diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) <= + assert(std::fabs(static_cast(diff)) < std::numeric_limits::epsilon()); } } @@ -124,4 +124,4 @@ int main() { } return 0; -} \ No newline at end of file +} diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp index c4a058db808de..c6683f9657c4a 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp @@ -12,20 +12,21 @@ template class mul; template class divide; template class logic; -template -void assert_ops_ref(host_accessor C, const R ref) { +template +void assert_ops_ref(host_accessor C, + const TResult ref) { for (size_t i = 0; i < Rows; i++) for (size_t j = 0; j < Cols; j++) { - R diff = C[i][j] - ref; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); + TResult diff = C[i][j] - ref; + assert(std::fabs(static_cast(diff)) <= + std::numeric_limits::epsilon()); } } template -void matrix_verify_op(big_matrix &B, const R ref, OP op) { +void matrix_verify_op(big_matrix &B, const TResult ref, OP op) { buffer bufB(B.get_data(), range<2>(Rows, Cols)); queue q; @@ -62,30 +63,30 @@ void matrix_verify_op(big_matrix &B, const R ref, OP op) { Cols * VNNI); }); // parallel for }).wait(); - assert_ops_ref(bufB.get_host_access(read_only), ref); + assert_ops_ref(bufB.get_host_access(read_only), ref); } -template +template void test() { static constexpr size_t Rows = TK * 2; static constexpr size_t Cols = TN * 2; - Ta B[Rows][Cols]; + T B[Rows][Cols]; - big_matrix MB((Ta *)&B); + big_matrix MB((T *)&B); - matrix_verify_op, TResult>( + matrix_verify_op, TResult>( MB, 7, [=](auto &x) { x = x + 2; }); - matrix_verify_op, TResult>( + matrix_verify_op, TResult>( MB, 3, [=](auto &x) { x = x - 2; }); - matrix_verify_op, TResult>( + matrix_verify_op, 
TResult>( MB, 10, [=](auto &x) { x = x * 2; }); - matrix_verify_op, TResult>( + matrix_verify_op, TResult>( MB, 2, [=](auto &x) { x = x / 2; }); // truncation is expected - matrix_verify_op, TResult>( + matrix_verify_op, TResult>( MB, 7, [=](auto &x) { if (x) { if (x > 2 || x >= 2 || x < 2 || x <= 2) { - Ta val = (x != 2) ? x : 2; + T val = (x != 2) ? x : 2; val--; val++; if (x == 2) { @@ -110,7 +111,7 @@ int main() { for (unsigned int i = 0; i < combinations.size(); i++) { if (combinations[i].nsize == 0) { // Intel AMX - test(); // should work + test(); break; } diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index 7999904ba7659..5f743cfe73b3d 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -11,6 +11,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// This is a version of the test with disabled device code // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} From 7cabdc732c7eb073e8a5e4fb36a73fa0d4a90299 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 28 May 2024 21:50:36 -0700 Subject: [PATCH 40/42] Update sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp Co-authored-by: Yury Plyakhin --- sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp index c6683f9657c4a..7336bb8467fa5 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp @@ -110,7 +110,7 @@ int main() { matrix_combinations>(); for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX + if (combinations[i].nsize == 0) { // Intel AMX test(); break; } 
From 0ed46c94abd52f9162a7b1dd41ba0c625e210529 Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 28 May 2024 21:51:20 -0700 Subject: [PATCH 41/42] Update sycl/test-e2e/Matrix/element_wise_all_sizes.cpp Co-authored-by: Yury Plyakhin --- sycl/test-e2e/Matrix/element_wise_all_sizes.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index 5f743cfe73b3d..e3da4f7d130eb 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -12,6 +12,8 @@ // RUN: %{run} %t.out // This is a version of the test with disabled device code +// This is a version of the test with disabled device code +// split to test against fixed bug in IGC // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out // RUN: %if gpu-intel-dg2 %{ %{run} %t_split.out %} From f2d2d1baab311dadd34287242d9273030468c95b Mon Sep 17 00:00:00 2001 From: Artem Radzikhovskyy Date: Tue, 28 May 2024 21:52:45 -0700 Subject: [PATCH 42/42] Update sycl/test-e2e/Matrix/element_wise_all_sizes.cpp --- sycl/test-e2e/Matrix/element_wise_all_sizes.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp index e3da4f7d130eb..22ec9b98a66c1 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp +++ b/sycl/test-e2e/Matrix/element_wise_all_sizes.cpp @@ -11,7 +11,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// This is a version of the test with disabled device code // This is a version of the test with disabled device code // split to test against fixed bug in IGC // RUN: %{build} -fsycl-device-code-split=off -o %t_split.out