Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ad8a477
SG32 #define SG_SZ
artemrad May 14, 2024
cd17776
XMX8 no SG_SZ
artemrad May 14, 2024
ae00144
WIP abc_impl: remove SG_SZ
artemrad May 15, 2024
d267874
Made tests independant of SG_SZ
artemrad May 15, 2024
0a71b88
clang-format
artemrad May 15, 2024
5621804
Clean up nits
artemrad May 16, 2024
23a7afc
Managed to remove the static code
artemrad May 16, 2024
a3c310b
Pass: elemwise_irreg_size_ops_bf16.cpp
artemrad May 21, 2024
22da1c2
Pass: joint_matrix_annotated_ptr
artemrad May 21, 2024
850f30b
Pass: joint_matrix_bfloat16_colmajorA_colmajorB
artemrad May 21, 2024
e7fcb5c
Pass: joint_matrix_int8_colmajorA_colmajorB
artemrad May 21, 2024
596aaff
Pass: joint_matrix_prefetch
artemrad May 21, 2024
fcbac58
Merge branch 'sycl' into sg_size
artemrad May 21, 2024
3563d69
Fixed sycl merge and joint_matrix_prefetch
artemrad May 21, 2024
c1bca5e
Fixed CPU tests
artemrad May 21, 2024
3fe79da
clang-format
artemrad May 21, 2024
8dc3756
clang-format
artemrad May 21, 2024
516f648
Undo changes
artemrad May 21, 2024
b48d61d
SG32 #define SG_SZ
artemrad May 14, 2024
c23311c
XMX8 no SG_SZ
artemrad May 14, 2024
db8cd7e
WIP abc_impl: remove SG_SZ
artemrad May 15, 2024
5877ed2
Made tests independant of SG_SZ
artemrad May 15, 2024
a5e15a2
clang-format
artemrad May 15, 2024
4cc31dd
Clean up nits
artemrad May 16, 2024
498fa1e
Managed to remove the static code
artemrad May 16, 2024
8ab7f80
element_wise_abc
artemrad May 17, 2024
9049d2f
WIP: element_wise_all_ops_half
artemrad May 21, 2024
6eae2da
Expanded tests that use combos
artemrad May 21, 2024
2fa9a43
Merge branch 'sg_size' into xmx8
artemrad May 21, 2024
4a9529a
Merge branch 'sycl' into xmx8
artemrad May 22, 2024
2a6b455
PassedL element_wise_all_ops_int8_packed
artemrad May 22, 2024
c88783b
Pass: element_wise_all_ops_int8
artemrad May 22, 2024
317e3c2
Pass: element_wise_all_sizes
artemrad May 22, 2024
7d51e9c
clang-format and nits
artemrad May 22, 2024
df1be4f
fixed requires
artemrad May 22, 2024
ced84eb
fixed element_wise_all_ops_int8
artemrad May 23, 2024
8b0e59d
CPU works element_wise_all_ops_half
artemrad May 27, 2024
925c241
CPU pass element_wise_all_ops_int8_packed
artemrad May 27, 2024
894d5f8
removed XMX8/element_wise_all_sizes_no_split
artemrad May 27, 2024
0a6a182
clang-format
artemrad May 27, 2024
72f021e
Typo in /element_wise_all_ops_half
artemrad May 28, 2024
587137c
nits
artemrad May 28, 2024
7cabdc7
Update sycl/test-e2e/Matrix/element_wise_all_ops_int8_packed_impl.hpp
artemrad May 29, 2024
0ed46c9
Update sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
artemrad May 29, 2024
f2d2d1b
Update sycl/test-e2e/Matrix/element_wise_all_sizes.cpp
artemrad May 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_abc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix
// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943

// RUN: %{build} -o %t.out
Expand All @@ -14,6 +16,5 @@
#include "../common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../element_wise_abc_impl.hpp"
6 changes: 3 additions & 3 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_ops_half.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-fp16
// REQUIRES: matrix,gpu
// REQUIRES: matrix-fp16
// REQUIRES: aspect-ext_intel_matrix, gpu
// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943

// RUN: %{build} -o %t.out
Expand All @@ -16,6 +17,5 @@
#include "../common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../element_wise_all_ops_half_impl.hpp"
5 changes: 3 additions & 2 deletions sycl/test-e2e/Matrix/SG32/element_wise_all_ops_int8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix
// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943

// RUN: %{build} -o %t.out
Expand All @@ -14,6 +16,5 @@
#include "../common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../element_wise_all_ops_int8_impl.hpp"
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix
// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
// UNSUPPORTED: gpu-intel-dg2
// REQUIRES: aspect-ext_intel_matrix
// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943

// RUN: %{build} -o %t.out
Expand All @@ -16,6 +18,5 @@
#include "../common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../element_wise_all_ops_int8_packed_impl.hpp"
17 changes: 0 additions & 17 deletions sycl/test-e2e/Matrix/XMX8/element_wise_abc.cpp

This file was deleted.

19 changes: 0 additions & 19 deletions sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_half.cpp

This file was deleted.

17 changes: 0 additions & 17 deletions sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8.cpp

This file was deleted.

19 changes: 0 additions & 19 deletions sycl/test-e2e/Matrix/XMX8/element_wise_all_ops_int8_packed.cpp

This file was deleted.

18 changes: 0 additions & 18 deletions sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes.cpp

This file was deleted.

21 changes: 0 additions & 21 deletions sycl/test-e2e/Matrix/XMX8/element_wise_all_sizes_no_split.cpp

This file was deleted.

5 changes: 1 addition & 4 deletions sycl/test-e2e/Matrix/element_wise_abc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix
// REQUIRES: aspect-ext_intel_matrix

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "common.hpp"

constexpr size_t TN = 16;

#include "element_wise_abc_impl.hpp"
63 changes: 43 additions & 20 deletions sycl/test-e2e/Matrix/element_wise_abc_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,26 @@
using namespace sycl;
using namespace sycl::ext::oneapi::experimental::matrix;

#define TM 8
#define TK 32
template <size_t M, size_t N, size_t K, int vnniFactor> class add;

template <typename T1, typename T2, size_t M, size_t N, size_t K,
int vnniFactor>
void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
big_matrix<T2, K / vnniFactor, N * vnniFactor> &B) {
size_t NDRangeM = M / TM;
size_t NDRangeN = N / TN;
size_t NDRangeM = 1;
size_t NDRangeN = 1;
buffer<T2, 2> bufA(A.get_data(), range<2>(M, K));
buffer<T2, 2> bufB(B.get_data(), range<2>(K, N));
buffer<T1, 2> bufC(C.get_data(), range<2>(M, N));

queue q;
size_t sg_size = get_sg_size<class add>(q);
size_t sg_size = get_sg_size<add<M, N, K, vnniFactor>>(q);
q.submit([&](handler &cgh) {
accessor accC{bufC, cgh};
accessor accA{bufA, cgh};
accessor accB{bufB, cgh};

cgh.parallel_for<class add>(
cgh.parallel_for<add<M, N, K, vnniFactor>>(
nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
[=](nd_item<2> spmd_item)
#ifdef SG_SZ
Expand All @@ -49,48 +48,72 @@ void matrix_elem_wise_ops(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
const auto sg_starty = global_idy - spmd_item.get_local_id(1);

sub_group sg = spmd_item.get_sub_group();
joint_matrix<sub_group, T2, use::a, TM, TK, layout::row_major> sub_a;
joint_matrix<sub_group, T2, use::a, M, K, layout::row_major> sub_a;
// For B, we assume B has been already VNNIed.
joint_matrix<sub_group, T2, use::b, TK, TN, layout::ext_intel_packed>
joint_matrix<sub_group, T2, use::b, K, N, layout::ext_intel_packed>
sub_b;
joint_matrix<sub_group, T1, use::accumulator, TM, TN> sub_c;
joint_matrix<sub_group, T1, use::accumulator, M, N> sub_c;

joint_matrix_load(
sg, sub_a,
accA.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * K,
(sg_startx * M) * K,
K);
joint_matrix_apply(sg, sub_a, [](T2 &x) { x += 1; });

joint_matrix_load(
sg, sub_b,
accB.template get_multi_ptr<access::decorated::no>() +
sg_starty / sg_size * TN * vnniFactor,
sg_starty / sg_size * N * vnniFactor,
N * vnniFactor);
joint_matrix_apply(sg, sub_b, [](T2 &x) { x += 1; });

joint_matrix_load(
sg, sub_c,
accC.template get_multi_ptr<access::decorated::no>() +
(sg_startx * TM) * N + sg_starty / sg_size * TN,
(sg_startx * M) * N + sg_starty / sg_size * N,
N, layout::row_major);
joint_matrix_apply(sg, sub_c, [](T1 &x) { x += 1; });
}); // parallel for
}).wait();
}

template <typename Ta, typename Tc, size_t TM, size_t TN, size_t TK, size_t VF>
void test() {
Tc A[TM][TK];
Tc B[TK / VF][TN * VF];
Ta C[TM][TN];

big_matrix<Ta, TM, TN> MC((Ta *)&C);
big_matrix<Tc, TM, TK> MA((Tc *)&A);
big_matrix<Tc, TK / VF, TN * VF> MB((Tc *)&B);

return matrix_elem_wise_ops<Ta, int8_t, TM, TN, TK, VF>(MC, MA, MB);
}

int main() {
static constexpr unsigned vnniFactor = 4;
queue q;
std::vector<combination> combinations =
q.get_device()
.get_info<sycl::ext::oneapi::experimental::info::device::
matrix_combinations>();

int8_t A[TM][TK];
int8_t B[TK / vnniFactor][TN * vnniFactor];
int32_t C[TM][TN];
for (unsigned int i = 0; i < combinations.size(); i++) {
if (combinations[i].nsize == 0) { // Intel AMX
test<int32_t, int8_t, 16, 16, 64, 4>();
break;
}

big_matrix<int32_t, TM, TN> MC((int32_t *)&C);
big_matrix<int8_t, TM, TK> MA((int8_t *)&A);
big_matrix<int8_t, TK / vnniFactor, TN * vnniFactor> MB((int8_t *)&B);
if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
test<int32_t, int8_t, 8, 16, 32, 4>();
break;
}

matrix_elem_wise_ops<int32_t, int8_t, TM, TN, TK, vnniFactor>(MC, MA, MB);
if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
test<int32_t, int8_t, 8, 8, 32, 4>();
break;
}
}

return 0;
}
5 changes: 1 addition & 4 deletions sycl/test-e2e/Matrix/element_wise_all_ops_half.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,11 @@
//
//===----------------------------------------------------------------------===//
// REQUIRES: aspect-fp16
// REQUIRES: matrix
// REQUIRES: aspect-ext_intel_matrix
// REQUIRES: matrix-fp16

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "common.hpp"

constexpr size_t TN = 16;

#include "element_wise_all_ops_half_impl.hpp"
Loading