-
Notifications
You must be signed in to change notification settings - Fork 798
Description
Discussed in #4185
Originally posted by bagrorg July 26, 2021
Initially, I wanted to post this as an issue, but then I doubted whether it was a bug or my mistake.
I have a kernel, after I changed gpu_selector to cpu_selector, I started getting the following error:
**Internal compiler error** Calls with kernel-call-once attributes cannot be serialized.
Please report the issue on Intel OpenCL forum
https://software.intel.com/en-us/forums/opencl for assistance.
Aborted (core dumped)
I started looking for where it could have occurred and it seems that it appears in this code snippet (I did not post the entire kernel, because it is quite large)
//_iter is a lsit's node pointer
bool find = false;
bool total_found = false;
for(int i = ind; i <= ind + SUBGROUP_SIZE * (CONST - 1); i += SUBGROUP_SIZE) {
find = ((_iter->data[i].first) == key);
sycl::group_barrier(_gr);
total_found = sycl::any_of_group(_gr, find);
if (total_found) {
for (int j = 0; j < SUBGROUP_SIZE; j++) {
if (sycl::group_broadcast(_gr, find, j)) {
uint32_t tmp = 0;
if (ind == j) tmp = _iter->data[i].second;
ans = {sycl::group_broadcast(_gr, tmp, j), true};
break;
}
}
}
}
If this error occurs due to my fault, then why does it occur and how can I fix it?
Environment
- OS: Linux
- Device: Intel(R) Core(TM) i7-6700 CPU
- Compiler version - 6ac26ad
Originally posted by bagrorg Aug 4, 2021
Please forgive me for such a long delay.
I have restored the conditions in which the error occurred and am sharing the code in which the kernel I mentioned is called
Sorry for the fact that so much, due to the fact that I do not know exactly where the error was, it was hard for me to say what could be deleted and what could not, I tried to remove unnecessary lines.
int main() {
std::vector<pair<uint32_t, uint32_t>> testUniv = {
{1, 2}, {5, 2}, {101, 3}, {21312, 5}, {3, 0}, {10, 10}};
sycl::queue q{sycl::gpu_selector()};
sycl::nd_range<1> r{SUBGROUP_SIZE * 3, SUBGROUP_SIZE};
std::vector<SlabList<pair<uint32_t, uint32_t>>> lists(BUCKETS_COUNT); //PREPARATION
for (auto &e : lists) {
e.root = sycl::global_ptr<SlabNode<pair<uint32_t, uint32_t>>>(
sycl::malloc_shared<SlabNode<pair<uint32_t, uint32_t>>>(CLUSTER_SIZE,
q));
*(e.root) = SlabNode<pair<uint32_t, uint32_t>>({EMPTY_UINT32_T, 0});
}
DefaultHasher<13, 24, 343> h;
for (auto &e : testUniv) {
auto r = lists[h(e.first)].root;
for (int i = 0; i < SLAB_SIZE; i++) {
if (r->data[i].first == EMPTY_UINT32_T) {
r->data[i] = e;
break;
}
}
}
std::vector<pair<bool, bool>> checks(6); //END OF PREPARATION
{
sycl::buffer<SlabList<pair<uint32_t, uint32_t>>> ls(lists);
sycl::buffer<sycl::global_ptr<SlabNode<pair<uint32_t, uint32_t>>>> its(3);
sycl::buffer<pair<uint32_t, uint32_t>> buffTestUniv(testUniv);
sycl::buffer<pair<bool, bool>> buffChecks(checks);
q.submit([&](sycl::handler &cgh) {
auto l = sycl::accessor(ls, cgh, sycl::read_write);
auto tests = sycl::accessor(buffTestUniv, cgh, sycl::read_only);
auto itrs = sycl::accessor(its, cgh, sycl::read_write);
auto accChecks = sycl::accessor(buffChecks, cgh, sycl::write_only);
cgh.parallel_for<class insert_test_slab_both>( // <- KERNEL
r, [=](sycl::nd_item<1> it) {
auto _lists = l.get_pointer();
sycl::global_ptr<SlabNode<pair<uint32_t, uint32_t>>> &_iter =
itrs[it.get_group().get_id()];
size_t _ind = it.get_local_id();
auto _empty = EMPTY_UINT32_T;
const sycl::group<1> &_gr = it.get_group();
size_t _group_ind = _gr.get_id();
DefaultHasher<13, 24, 343> _hasher;
for (int i = _group_ind * 2; i < _group_ind * 2 + 2; i++) {
std::optional<uint32_t> _ans;
auto _key = tests[i].first;
auto _val = tests[i].second;
if (_ind == 0) {
_iter = (_lists + _hasher(_key))->root;
}
sycl::group_barrier(_gr);
while (_iter != nullptr) {
bool found_in_node = false;
bool total_found = false;
bool find = false;
for (int i = _ind; i <= _ind + SUBGROUP_SIZE * (CONST - 1);
i += SUBGROUP_SIZE) {
find = ((_iter->data[i].first) == _key);
sycl::group_barrier(_gr);
total_found = sycl::any_of_group(_gr, find);
if (total_found) {
bool found_in_subgroup = false;
for (int j = 0; j < SUBGROUP_SIZE; j++) {
if (cl::sycl::group_broadcast(_gr, find, j)) {
uint32_t tmp;
if (_ind == j)
tmp = _iter->data[i].second;
_ans = std::optional<uint32_t>{
cl::sycl::group_broadcast(_gr, tmp, j)};
break;
}
}
if (found_in_subgroup) {
found_in_node = true;
break;
}
}
}
if (found_in_node) {
break;
} else if (_ind == 0) {
_iter = _iter->next;
}
sycl::group_barrier(_gr);
}
if (it.get_local_id() == 0)
accChecks[i] = {static_cast<bool>(_ans),
_ans.value_or(-1) == tests[i].second};
}
});
}).wait();
}
for (auto &e : lists) {
sycl::free(e.root, q);
}
}
The mentioned error occurs if you replace gpu_selector with cpu_selector at the very beginning.