Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 39 additions & 44 deletions SYCL/Plugin/level_zero_sub_sub_device.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// REQUIRES: gpu-intel-pvc, level_zero
// REQUIRES: gpu-intel-pvc, level_zero

// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
Expand All @@ -11,37 +11,36 @@
// CHECK: [getZeQueue]: create queue ordinal = 0, index = 2 (round robin in [2, 2])
// CHECK: [getZeQueue]: create queue ordinal = 0, index = 3 (round robin in [3, 3])

#include "CL/sycl.hpp"
#include "CL/sycl/usm.hpp"
#include <chrono>
#include <cmath>
#include <iostream>
#include <math.h>
#include <sycl/sycl.hpp>
#include <unistd.h>

namespace sycl = cl::sycl;
using namespace sycl;
using namespace std::chrono;

#define random_float() (rand() / double(RAND_MAX))
#define INTER_NUM (150)
#define KERNEL_NUM (2000)

void run(std::vector<sycl::queue> &queues) {
void run(std::vector<queue> &queues) {
auto N = 1024 * 16;
size_t global_range = 1024;
size_t local_range = 16;

float *buffer_host0 = sycl::malloc_host<float>(N, queues[0]);
float *buffer_device0 = sycl::malloc_device<float>(N, queues[0]);
float *buffer_host0 = malloc_host<float>(N, queues[0]);
float *buffer_device0 = malloc_device<float>(N, queues[0]);

float *buffer_host1 = sycl::malloc_host<float>(N, queues[1]);
float *buffer_device1 = sycl::malloc_device<float>(N, queues[1]);
float *buffer_host1 = malloc_host<float>(N, queues[1]);
float *buffer_device1 = malloc_device<float>(N, queues[1]);

float *buffer_host2 = sycl::malloc_host<float>(N, queues[2]);
float *buffer_device2 = sycl::malloc_device<float>(N, queues[2]);
float *buffer_host2 = malloc_host<float>(N, queues[2]);
float *buffer_device2 = malloc_device<float>(N, queues[2]);

float *buffer_host3 = sycl::malloc_host<float>(N, queues[3]);
float *buffer_device3 = sycl::malloc_device<float>(N, queues[3]);
float *buffer_host3 = malloc_host<float>(N, queues[3]);
float *buffer_device3 = malloc_device<float>(N, queues[3]);

for (int i = 0; i < N; ++i) {
buffer_host0[i] = static_cast<float>(random_float());
Expand All @@ -57,38 +56,34 @@ void run(std::vector<sycl::queue> &queues) {

for (auto m = 0; m < INTER_NUM; ++m) {
for (int k = 0; k < KERNEL_NUM; ++k) {
auto event0 = queues[0].submit([&](sycl::handler &h) {
auto event0 = queues[0].submit([&](handler &h) {
h.parallel_for<class kernel0>(
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
cl::sycl::range<1>{local_range}),
[=](cl::sycl::nd_item<1> item) {
nd_range<1>(range<1>{global_range}, range<1>{local_range}),
[=](nd_item<1> item) {
int i = item.get_global_linear_id();
buffer_device0[i] = buffer_device0[i] + float(2.0);
});
});
auto event1 = queues[1].submit([&](sycl::handler &h) {
auto event1 = queues[1].submit([&](handler &h) {
h.parallel_for<class kernel1>(
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
cl::sycl::range<1>{local_range}),
[=](cl::sycl::nd_item<1> item) {
nd_range<1>(range<1>{global_range}, range<1>{local_range}),
[=](nd_item<1> item) {
int i = item.get_global_linear_id();
buffer_device1[i] = buffer_device1[i] + float(2.0);
});
});
auto event2 = queues[2].submit([&](sycl::handler &h) {
auto event2 = queues[2].submit([&](handler &h) {
h.parallel_for<class kernel2>(
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
cl::sycl::range<1>{local_range}),
[=](cl::sycl::nd_item<1> item) {
nd_range<1>(range<1>{global_range}, range<1>{local_range}),
[=](nd_item<1> item) {
int i = item.get_global_linear_id();
buffer_device2[i] = buffer_device2[i] + float(2.0);
});
});
auto event3 = queues[3].submit([&](sycl::handler &h) {
auto event3 = queues[3].submit([&](handler &h) {
h.parallel_for<class kernel3>(
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
cl::sycl::range<1>{local_range}),
[=](cl::sycl::nd_item<1> item) {
nd_range<1>(range<1>{global_range}, range<1>{local_range}),
[=](nd_item<1> item) {
int i = item.get_global_linear_id();
buffer_device3[i] = buffer_device3[i] + float(2.0);
});
Expand Down Expand Up @@ -119,22 +114,22 @@ int main(void) {
std::cout << "[info] this case is used to submit workloads to queues on "
"subsub device"
<< std::endl;
std::vector<sycl::device> subsub;
std::vector<device> subsub;

auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
auto devices = device::get_devices(info::device_type::gpu);
std::cout << "[info] device count = " << devices.size() << std::endl;

// watch out device here
auto subdevices =
devices[1]
.create_sub_devices<
sycl::info::partition_property::partition_by_affinity_domain>(
sycl::info::partition_affinity_domain::next_partitionable);
info::partition_property::partition_by_affinity_domain>(
info::partition_affinity_domain::next_partitionable);
std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
for (auto &subdev : subdevices) {
auto subsubdevices = subdev.create_sub_devices<
sycl::info::partition_property::partition_by_affinity_domain>(
sycl::info::partition_affinity_domain::next_partitionable);
info::partition_property::partition_by_affinity_domain>(
info::partition_affinity_domain::next_partitionable);
std::cout << "[info] sub-sub device size = " << subsubdevices.size()
<< std::endl;
for (auto &subsubdev : subsubdevices) {
Expand All @@ -147,16 +142,16 @@ int main(void) {
std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices"
<< std::endl;

sycl::queue q0(subsub[0], {cl::sycl::property::queue::enable_profiling(),
cl::sycl::property::queue::in_order()});
sycl::queue q1(subsub[1], {cl::sycl::property::queue::enable_profiling(),
cl::sycl::property::queue::in_order()});
sycl::queue q2(subsub[2], {cl::sycl::property::queue::enable_profiling(),
cl::sycl::property::queue::in_order()});
sycl::queue q3(subsub[4], {cl::sycl::property::queue::enable_profiling(),
cl::sycl::property::queue::in_order()});
queue q0(subsub[0],
{property::queue::enable_profiling(), property::queue::in_order()});
queue q1(subsub[1],
{property::queue::enable_profiling(), property::queue::in_order()});
queue q2(subsub[2],
{property::queue::enable_profiling(), property::queue::in_order()});
queue q3(subsub[4],
{property::queue::enable_profiling(), property::queue::in_order()});

std::vector<sycl::queue> queues;
std::vector<queue> queues;

queues.push_back(std::move(q0));
queues.push_back(std::move(q1));
Expand Down