@@ -50,8 +50,8 @@ std::string toString(dpas_argument_type T) {
5050 return " bf16" ;
5151 case dpas_argument_type::tf32:
5252 return " tf32" ;
53- case dpas_argument_type::S1 :
54- case dpas_argument_type::U1 :
53+ case dpas_argument_type::s1 :
54+ case dpas_argument_type::u1 :
5555 case dpas_argument_type::Invalid:
5656 return " UNSUPPORTED" ;
5757 }
@@ -65,7 +65,7 @@ template <dpas_argument_type T> struct DpasPrintType {
6565 static constexpr bool is_uint = T == dpas_argument_type::u2 ||
6666 T == dpas_argument_type::u4 ||
6767 T == dpas_argument_type::u8 ;
68- static constexpr bool is_fp = T == dpas_argument_type::FP16 ||
68+ static constexpr bool is_fp = T == dpas_argument_type::fp16 ||
6969 T == dpas_argument_type::bf16 ||
7070 T == dpas_argument_type::tf32;
7171
@@ -100,7 +100,7 @@ template <dpas_argument_type T> struct DpasNaturalOperandType {
100100 is_uint, unsigned char ,
101101 std::conditional_t <
102102 is_fp16, sycl::half,
103- std::conditional <
103+ std::conditional_t <
104104 is_bf16, sycl::ext::oneapi::experimental::bfloat16, void >>>>;
105105};
106106
@@ -123,6 +123,11 @@ template <dpas_argument_type T> constexpr int getBitSize() {
123123
124124 case dpas_argument_type::tf32:
125125 return 32 ;
126+
127+ case dpas_argument_type::Invalid:
128+ case dpas_argument_type::s1:
129+ case dpas_argument_type::u1:
130+ break ;
126131 }
127132 return 0 ;
128133}
@@ -282,7 +287,8 @@ void printMatrix(void *Vec, std::string Msg) {
282287}
283288
284289template <int SystolicDepth, int RepeatCount, dpas_argument_type BPrec,
285- dpas_argument_type APrec, bool UseSrc0>
290+ dpas_argument_type APrec, bool UseSrc0, int ExecSize,
291+ bool LetDeduceArgs>
286292bool test (queue &Q, bool Print) {
287293 constexpr unsigned Size = 128 ;
288294 constexpr unsigned VL = 16 ;
@@ -300,12 +306,13 @@ bool test(queue &Q, bool Print) {
300306 // where:
301307 constexpr int M = RepeatCount;
302308 constexpr int K = SystolicDepth * OpsPerChannel;
303- constexpr int N = 16 ; // Execution size: 16 for PVC.
309+ constexpr int N = ExecSize ; // 16 for PVC, 8 for DG2 .
304310
305311 auto Dev = Q.get_device ();
306- std::cout << " Running test case " << toString (BPrec, APrec)
307- << " with UseSrc0 = " << UseSrc0 << " on "
308- << Dev.get_info <info::device::name>() << " \n " ;
312+ std::cout << " Running on " << Dev.get_info <info::device::name>()
313+ << " (ExecSize = " << ExecSize << " ): " << toString (BPrec, APrec)
314+ << " , UseSrc0 = " << UseSrc0
315+ << " , LetDeduceArgs = " << LetDeduceArgs << std::endl;
309316
310317 using ANaturalType = typename DpasNaturalOperandType<APrec>::type;
311318 using BNaturalType = typename DpasNaturalOperandType<BPrec>::type;
@@ -317,10 +324,10 @@ bool test(queue &Q, bool Print) {
317324 auto BPacked = aligned_alloc_shared<BNaturalType>(128 , BPackedSize, Q);
318325 auto Res = aligned_alloc_shared<ResNaturalType>(128 , M * N, Q);
319326 // Init APacked;
320- int Value = 0 ;
327+ float Value = 1.2 ;
321328 for (int II = 0 ; II < M; II++) {
322329 for (int JJ = 0 ; JJ < K; JJ++) {
323- Value++ ;
330+ Value += 1.1 ;
324331 writeToHorizontallyPackedMatrix<M, K, APrec>(
325332 APacked, II, JJ, static_cast <ANaturalType>(Value));
326333 }
@@ -345,15 +352,27 @@ bool test(queue &Q, bool Print) {
345352 simd<BNaturalType, BPackedSize> B (BPacked, overaligned_tag<16 >{});
346353 simd<ResNaturalType, M * N> C;
347354
348- if constexpr (UseSrc0) {
349- // Compute C = C + AxB;
350- C = 1 ;
351- C = dpas<8 , RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
352- ANaturalType, BPrec, APrec>(C, B, A);
355+ if constexpr (LetDeduceArgs) {
356+ if constexpr (UseSrc0) {
357+ // Compute C = C + AxB;
358+ C = 1 ;
359+ C = dpas<8 , RepeatCount, ResNaturalType>(C, B, A);
360+ } else {
361+ // Compute C = AxB;
362+ C = dpas<8 , RepeatCount, ResNaturalType>(B, A);
363+ }
364+
353365 } else {
354- // Compute C = AxB;
355- C = dpas<8 , RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
356- BPrec, APrec>(B, A);
366+ if constexpr (UseSrc0) {
367+ // Compute C = C + AxB;
368+ C = 1 ;
369+ C = dpas<8 , RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
370+ ANaturalType, BPrec, APrec>(C, B, A);
371+ } else {
372+ // Compute C = AxB;
373+ C = dpas<8 , RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
374+ BPrec, APrec>(B, A);
375+ }
357376 }
358377
359378 C.copy_to (Res);
@@ -396,11 +415,40 @@ bool test(queue &Q, bool Print) {
396415}
397416
398417template <int SystolicDepth, int RepeatCount, dpas_argument_type T1,
399- dpas_argument_type T2>
418+ dpas_argument_type T2, bool LetDeduceArgs = false >
400419bool tests (queue Q, bool Print) {
401420 bool Passed = true ;
402421 constexpr bool UseSrc0 = true ;
403- Passed &= test<SystolicDepth, RepeatCount, T1, T2, UseSrc0>(Q, Print);
404- Passed &= test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0>(Q, Print);
422+ auto Dev = Q.get_device ();
423+
424+ // Detect the execution size.
425+ // The device trait is not implemented for esimd_emulator. Use both 8 and 16.
426+ int ExecSize;
427+ bool IsEmulator = false ;
428+ try {
429+ ExecSize = Dev.get_info <ext::intel::info::device::gpu_eu_simd_width>();
430+ } catch (sycl::exception e) {
431+ IsEmulator = true ;
432+ }
433+ assert ((IsEmulator || (ExecSize == 8 || ExecSize == 16 )) &&
434+ " Execution size must be 8 or 16" );
435+
436+ if (ExecSize == 16 || IsEmulator) {
437+ Passed &=
438+ test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 16 , LetDeduceArgs>(
439+ Q, Print);
440+ Passed &=
441+ test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 16 , LetDeduceArgs>(
442+ Q, Print);
443+ }
444+ if (ExecSize == 8 || IsEmulator) {
445+ Passed &=
446+ test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 8 , LetDeduceArgs>(
447+ Q, Print);
448+ Passed &=
449+ test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 8 , LetDeduceArgs>(
450+ Q, Print);
451+ }
452+
405453 return Passed;
406454}
0 commit comments