Add link for SYCL runtime #581

abhilash1910 · 2024-06-12T16:28:17Z

chin-jey · 2024-06-15T21:16:21Z

The kernels under dev/sycl don't compile for me on Intel devcloud, e.g.

icpx -fsycl attention_forward.dp.cpp -o attn_fwd -qmkl=parallel

which yields

In file included from attention_forward.dp.cpp:57:
./common.h:60:7: error: no member named 'blas' in namespace 'dpct'
   60 | dpct::blas::descriptor_ptr cublas_handle;
      | ~~~~~~^
./common.h:61:1: error: unknown type name 'cublasLtHandle_t'
   61 | cublasLtHandle_t cublaslt_handle;
      | ^
./common.h:138:9: error: use of undeclared identifier '__ldcs'
  138 |         __ldcs(reinterpret_cast<const sycl::int4 *>(address))};
      |         ^
./common.h:216:35: warning: implicit conversion from 'int' to 'float' changes value from 2147483647 to 2147483648 [-Wimplicit-const-int-float-conversion]
  216 |         arr[i] = ((float)rand() / RAND_MAX); // range 0..1
      |                                 ~ ^~~~~~~~
/usr/include/stdlib.h:87:18: note: expanded from macro 'RAND_MAX'
   87 | #define RAND_MAX        2147483647
      |                         ^~~~~~~~~~
In file included from attention_forward.dp.cpp:57:
./common.h:224:35: warning: implicit conversion from 'int' to 'float' changes value from 2147483647 to 2147483648 [-Wimplicit-const-int-float-conversion]
  224 |         arr[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; // range -1..1
      |                                 ~ ^~~~~~~~
/usr/include/stdlib.h:87:18: note: expanded from macro 'RAND_MAX'
   87 | #define RAND_MAX        2147483647
      |                         ^~~~~~~~~~
In file included from attention_forward.dp.cpp:57:
./common.h:307:60: error: no member named 'blas' in namespace 'dpct'
  307 |     cublasCheck(DPCT_CHECK_ERROR(cublas_handle = new dpct::blas::descriptor()));
      |                                                      ~~~~~~^
/glob/development-tools/versions/oneapi/2024.1/oneapi/dpcpp-ct/2024.1/include/dpct/dpct.hpp:53:7: note: expanded from macro 'DPCT_CHECK_ERROR'
   53 |       expr;                                                                    \
      |       ^~~~
./common.h:50:44: note: expanded from macro 'cublasCheck'
   50 | #define cublasCheck(status) { cublasCheck((status), __FILE__, __LINE__); }
      |                                            ^~~~~~
./common.h:325:41: error: use of undeclared identifier 'CUBLAS_COMPUTE_32F_FAST_TF32'
  325 |     cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F;
      |                                         ^
./common.h:325:72: error: use of undeclared identifier 'CUBLAS_COMPUTE_32F'
  325 |     cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F;
      |                                                                        ^
./common.h:327:23: error: use of undeclared identifier 'CUBLAS_TF32_TENSOR_OP_MATH'
  327 |         enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH;
      |                       ^
./common.h:327:52: error: use of undeclared identifier 'CUBLAS_DEFAULT_MATH'
  327 |         enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH;
      |                                                    ^
./common.h:433:19: error: no member named 'sync_barrier' in namespace 'dpct'
  433 |             dpct::sync_barrier(start, &dpct::get_in_order_queue())));
      |             ~~~~~~^
/glob/development-tools/versions/oneapi/2024.1/oneapi/dpcpp-ct/2024.1/include/dpct/dpct.hpp:53:7: note: expanded from macro 'DPCT_CHECK_ERROR'
   53 |       expr;                                                                    \
      |       ^~~~
./common.h:40:36: note: expanded from macro 'cudaCheck'
   40 | #define cudaCheck(err) (cuda_check(err, __FILE__, __LINE__))
      |                                    ^~~
./common.h:441:19: error: no member named 'sync_barrier' in namespace 'dpct'
  441 |             dpct::sync_barrier(stop, &dpct::get_in_order_queue())));
      |             ~~~~~~^
/glob/development-tools/versions/oneapi/2024.1/oneapi/dpcpp-ct/2024.1/include/dpct/dpct.hpp:53:7: note: expanded from macro 'DPCT_CHECK_ERROR'
   53 |       expr;                                                                    \
      |       ^~~~
./common.h:40:36: note: expanded from macro 'cudaCheck'
   40 | #define cudaCheck(err) (cuda_check(err, __FILE__, __LINE__))
      |                                    ^~~
attention_forward.dp.cpp:387:44: error: no member named 'meta_group_size' in 'sycl::sub_group'; did you mean 'get_group_id'?
  387 |     int idx = item_ct1.get_group(2) * warp.meta_group_size() +
      |                                            ^~~~~~~~~~~~~~~
      |                                            get_group_id
/glob/development-tools/versions/oneapi/2024.1/oneapi/compiler/2024.1/bin/compiler/../../include/sycl/sub_group.hpp:192:11: note: 'get_group_id' declared here
  192 |   id_type get_group_id() const {
      |           ^
attention_forward.dp.cpp:442:44: error: use of undeclared identifier '__ldcs'
  442 |         float ev = expf(inv_temperature * (__ldcs(x + i) - global_maxval));
      |                                            ^
attention_forward.dp.cpp:443:9: warning: implicitly declaring library function '__stosb' with type 'void (unsigned char *, unsigned char, unsigned long) noexcept' [-Wimplicit-function-declaration]
  443 |         __stcs(out + idx * T + i, ev * norm);
      |         ^
attention_forward.dp.cpp:443:9: note: include the header <intrin.h> or explicitly provide a declaration for '__stosb'
attention_forward.dp.cpp:443:9: error: use of undeclared identifier '__stcs'
  443 |         __stcs(out + idx * T + i, ev * norm);
      |         ^
attention_forward.dp.cpp:719:42: error: no member named 'meta_group_size' in 'sycl::sub_group'; did you mean 'get_group_id'?
  719 |     int t = item_ct1.get_group(2) * warp.meta_group_size() +
      |                                          ^~~~~~~~~~~~~~~
      |                                          get_group_id
/glob/development-tools/versions/oneapi/2024.1/oneapi/compiler/2024.1/bin/compiler/../../include/sycl/sub_group.hpp:192:11: note: 'get_group_id' declared here
  192 |   id_type get_group_id() const {
      |           ^
attention_forward.dp.cpp:883:44: error: use of undeclared identifier 'cudaDevAttrMaxSharedMemoryPerBlock'
  883 |     cudaDeviceGetAttribute(&max_sram_size, cudaDevAttrMaxSharedMemoryPerBlock, 0);
      |                                            ^
attention_forward.dp.cpp:1172:44: error: no member named 'meta_group_size' in 'sycl::sub_group'; did you mean 'get_group_id'?
 1172 |     int idx = item_ct1.get_group(2) * warp.meta_group_size() +
      |                                            ^~~~~~~~~~~~~~~
      |                                            get_group_id
/glob/development-tools/versions/oneapi/2024.1/oneapi/compiler/2024.1/bin/compiler/../../include/sycl/sub_group.hpp:192:11: note: 'get_group_id' declared here
  192 |   id_type get_group_id() const {
      |           ^
attention_forward.dp.cpp:1333:23: error: use of undeclared identifier 'CUBLAS_COMPUTE_32F'
 1333 |     void* alpha_ptr = CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F ? (void*)&alpha_lowp : (void*)&alpha;
      |                       ^
./common.h:177:29: note: expanded from macro 'CUBLAS_LOWP_COMPUTE'
  177 | #define CUBLAS_LOWP_COMPUTE CUBLAS_COMPUTE_32F
      |                             ^
attention_forward.dp.cpp:1333:46: error: use of undeclared identifier 'CUBLAS_COMPUTE_16F'
 1333 |     void* alpha_ptr = CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F ? (void*)&alpha_lowp : (void*)&alpha;
      |                                              ^
attention_forward.dp.cpp:1334:22: error: use of undeclared identifier 'CUBLAS_COMPUTE_32F'
 1334 |     void* beta_ptr = CUBLAS_LOWP_COMPUTE == CUBLAS_COMPUTE_16F ? (void*)&beta_lowp : (void*)&beta;
      |                      ^
./common.h:177:29: note: expanded from macro 'CUBLAS_LOWP_COMPUTE'
  177 | #define CUBLAS_LOWP_COMPUTE CUBLAS_COMPUTE_32F
      |                             ^
fatal error: too many errors emitted, stopping now [-ferror-limit=]
3 warnings and 20 errors generated.

Are they meant to be compiled differently?

abhilash1910 · 2024-06-16T05:08:23Z

Hi @chin-jey , please use the updated llmc/ kernels ( the dev ones are older verions and not updated yet)
. Please follow the instructions inside the readme . For running attenion.cpp inside llmc/attention.cpp, use
make llmc/attention

chin-jey · 2024-06-16T19:54:39Z

Hi @abhilash1910, I wanted to be able to run the version under dev though. However, I found this other repo which ported most of the kernels so I'm using that instead!

abhilash1910 · 2024-06-17T00:18:44Z

The dev folder contains older kernels, which will be supported eventually. Currently it supports the prime kernels required for end to end runs and benchmarks - train gpt2 /32 which includes headers under llmc. This includes cudnn kernels as well.
I guess it would take a few days to see the upgraded deb folder in the repository as currently focus is on public end to end benchmarks.

abhilash1910 added 2 commits June 12, 2024 21:54

add link for sycl

d104075

add link for sycl

f92e1be

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add link for SYCL runtime #581

Add link for SYCL runtime #581

abhilash1910 commented Jun 12, 2024

chin-jey commented Jun 15, 2024

abhilash1910 commented Jun 16, 2024

chin-jey commented Jun 16, 2024

abhilash1910 commented Jun 17, 2024

Add link for SYCL runtime #581

Are you sure you want to change the base?

Add link for SYCL runtime #581

Conversation

abhilash1910 commented Jun 12, 2024

chin-jey commented Jun 15, 2024

abhilash1910 commented Jun 16, 2024

chin-jey commented Jun 16, 2024

abhilash1910 commented Jun 17, 2024