Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7299e7c
[SYCL] Pooling of USM memory allocated for buffers.
rdeodhar Mar 25, 2021
835fec3
Removed dead code.
rdeodhar Mar 25, 2021
abcffba
Added test.
rdeodhar Mar 26, 2021
dd0b5c6
Restrict test to level_zero.
rdeodhar Mar 26, 2021
cc809e4
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Mar 26, 2021
762cb2a
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Mar 30, 2021
b47c296
Changed defaults; other review comments.
rdeodhar Mar 30, 2021
66c8138
Test corrected and build error fixed.
rdeodhar Mar 31, 2021
3abfa22
Test correction.
rdeodhar Mar 31, 2021
f875ade
Change to alignment.
rdeodhar Apr 1, 2021
31a1e77
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Apr 1, 2021
1058924
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Apr 1, 2021
d3a03a7
Change to buffer alignment.
rdeodhar Apr 1, 2021
00c87db
Changes based on review comments.
rdeodhar Apr 6, 2021
27c3960
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Apr 8, 2021
f2f8c54
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Apr 13, 2021
1733de6
Review responses.
rdeodhar Apr 13, 2021
9391aa2
Fix for max poolable size.
rdeodhar Apr 13, 2021
eae431b
Clarified env var settings.
rdeodhar Apr 14, 2021
2b4cca0
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Apr 14, 2021
7f1aa63
Merge branch 'sycl' of https://github.com/intel/llvm into ibuf6
rdeodhar Apr 16, 2021
44d5a0c
Changed some function names for uniformity. Added comments.
rdeodhar Apr 16, 2021
7d95049
Modified the env var documentation.
rdeodhar Apr 16, 2021
1aa0a12
Moved pool settings into a class. Added comments.
rdeodhar Apr 16, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/doc/EnvironmentVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ subject to change. Do not rely on these variables in production code.
| SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
| SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000; the threshold may be increased based on resource availability and workload demand. |
| SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Values specified as positive integers. Defaults are 1, 4, 256. MaxPoolableSize is the maximum allocation size in MB that may be pooled. Capacity is the number of allocations in each size range that are freed by the program but retained in the pool for reallocation. Size ranges follow this pattern: 32, 48, 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. MaxPoolSize is the maximum size of the pool in MB. |
| SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
| SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
| SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. |
Expand Down
147 changes: 103 additions & 44 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2458,7 +2458,6 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
}

void *Ptr;
ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice;

// We treat integrated devices (physical memory shared with the CPU)
// differently from discrete devices (those with distinct memories).
Expand All @@ -2478,20 +2477,33 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
//
}

// Choose an alignment that is at most 64 and is the next power of 2 for sizes
// less than 64.
auto Alignment = Size;
if (Alignment > 32UL)
Alignment = 64UL;
else if (Alignment > 16UL)
Alignment = 32UL;
else if (Alignment > 8UL)
Alignment = 16UL;
else if (Alignment > 4UL)
Alignment = 8UL;
else if (Alignment > 2UL)
Alignment = 4UL;
else if (Alignment > 1UL)
Alignment = 2UL;
else
Alignment = 1UL;

pi_result Result;
if (DeviceIsIntegrated) {
ze_host_mem_alloc_desc_t ZeDesc = {};
ZeDesc.flags = 0;

ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, &Ptr));

Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
} else {
ze_device_mem_alloc_desc_t ZeDesc = {};
ZeDesc.flags = 0;
ZeDesc.ordinal = 0;

ZE_CALL(zeMemAllocDevice,
(Context->ZeContext, &ZeDesc, Size, 1, ZeDevice, &Ptr));
Result = piextUSMDeviceAlloc(&Ptr, Context, Context->Devices[0], nullptr,
Size, Alignment);
}
if (Result != PI_SUCCESS)
return Result;

if (HostPtr) {
if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
Expand Down Expand Up @@ -2559,7 +2571,7 @@ pi_result piMemRelease(pi_mem Mem) {
} else {
auto Buf = static_cast<_pi_buffer *>(Mem);
if (!Buf->isSubBuffer()) {
ZE_CALL(zeMemFree, (Mem->Context->ZeContext, Mem->getZeHandle()));
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
}
}
delete Mem;
Expand Down Expand Up @@ -5388,39 +5400,17 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program,
return mapError(ZeResult);
}

// Allocates host USM memory by calling zeMemAllocHost directly on the
// context's Level Zero handle.
//
// ResultPtr  - receives the pointer produced by zeMemAllocHost.
// Context    - PI context; checked with PI_ASSERT against PI_INVALID_CONTEXT.
// Properties - optional allocation flags; may be null. When non-null, any bit
//              outside PI_MEM_ALLOC_FLAGS fails the PI_INVALID_VALUE check.
// Size       - requested size, forwarded unchanged to zeMemAllocHost.
// Alignment  - requested alignment, forwarded unchanged to zeMemAllocHost;
//              a value of 0 skips the post-allocation alignment check.
//
// Returns PI_SUCCESS when all checks and the Level Zero call pass. How
// PI_ASSERT and ZE_CALL report failures is defined by those macros, which are
// not visible here.
pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
pi_usm_mem_properties *Properties, size_t Size,
pi_uint32 Alignment) {
PI_ASSERT(Context, PI_INVALID_CONTEXT);

// Check that incorrect bits are not set in the properties.
PI_ASSERT(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)),
PI_INVALID_VALUE);

// The descriptor is value-initialized and no flags are requested.
ze_host_mem_alloc_desc_t ZeDesc = {};
ZeDesc.flags = 0;
// TODO: translate PI properties to Level Zero flags
ZE_CALL(zeMemAllocHost,
(Context->ZeContext, &ZeDesc, Size, Alignment, ResultPtr));

// Verify that the returned pointer honours the requested alignment.
// NOTE(review): if PI_ASSERT returns early here, the memory just allocated is
// not freed in this function - confirm whether that is intended.
PI_ASSERT(Alignment == 0 ||
reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
PI_INVALID_VALUE);

return PI_SUCCESS;
}

// Reports whether the USM allocator should be used. It is on unless the
// SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR environment variable is present;
// only the variable's presence is tested, its value is never inspected.
static bool ShouldUseUSMAllocator() {
  const char *DisableRequest =
      std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR");
  if (DisableRequest != nullptr)
    return false;
  return true;
}

static const bool UseUSMAllocator = ShouldUseUSMAllocator();

pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
pi_device Device,
pi_usm_mem_properties *Properties, size_t Size,
pi_uint32 Alignment) {
static pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
pi_device Device,
pi_usm_mem_properties *Properties,
size_t Size, pi_uint32 Alignment) {
PI_ASSERT(Context, PI_INVALID_CONTEXT);
PI_ASSERT(Device, PI_INVALID_DEVICE);

Expand All @@ -5442,10 +5432,10 @@ pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
return PI_SUCCESS;
}

pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
pi_device Device,
pi_usm_mem_properties *Properties, size_t Size,
pi_uint32 Alignment) {
static pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
pi_device Device,
pi_usm_mem_properties *Properties,
size_t Size, pi_uint32 Alignment) {
PI_ASSERT(Context, PI_INVALID_CONTEXT);
PI_ASSERT(Device, PI_INVALID_DEVICE);

Expand All @@ -5469,7 +5459,29 @@ pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
return PI_SUCCESS;
}

pi_result USMFreeImpl(pi_context Context, void *Ptr) {
// Performs a host USM allocation by calling zeMemAllocHost directly on the
// context's Level Zero handle. Both piextUSMHostAlloc and
// USMHostMemoryAlloc::allocateImpl call this routine.
//
// ResultPtr  - receives the pointer produced by zeMemAllocHost.
// Context    - PI context; checked with PI_ASSERT against PI_INVALID_CONTEXT.
// Properties - optional allocation flags; may be null. When non-null, any bit
//              outside PI_MEM_ALLOC_FLAGS fails the PI_INVALID_VALUE check.
// Size       - requested size, forwarded unchanged to zeMemAllocHost.
// Alignment  - requested alignment, forwarded unchanged to zeMemAllocHost;
//              a value of 0 skips the post-allocation alignment check.
//
// Returns PI_SUCCESS when all checks and the Level Zero call pass. How
// PI_ASSERT and ZE_CALL report failures is defined by those macros, which are
// not visible here.
static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
pi_usm_mem_properties *Properties,
size_t Size, pi_uint32 Alignment) {
PI_ASSERT(Context, PI_INVALID_CONTEXT);

// Check that incorrect bits are not set in the properties.
PI_ASSERT(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)),
PI_INVALID_VALUE);

// TODO: translate PI properties to Level Zero flags
// The descriptor is value-initialized and no flags are requested.
ze_host_mem_alloc_desc_t ZeHostDesc = {};
ZeHostDesc.flags = 0;
ZE_CALL(zeMemAllocHost,
(Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr));

// Verify that the returned pointer honours the requested alignment.
// NOTE(review): if PI_ASSERT returns early here, the memory just allocated is
// not freed in this function - confirm whether that is intended.
PI_ASSERT(Alignment == 0 ||
reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
PI_INVALID_VALUE);

return PI_SUCCESS;
}

// Releases Ptr by passing it to zeMemFree on the context's Level Zero handle.
// Returns PI_SUCCESS once the ZE_CALL-wrapped call completes; how ZE_CALL
// reports a failure is defined by that macro (not visible here).
// NOTE(review): Context is dereferenced without a null check - presumably
// callers validate it; confirm.
static pi_result USMFreeImpl(pi_context Context, void *Ptr) {
ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
return PI_SUCCESS;
}
Expand All @@ -5495,6 +5507,11 @@ pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
Alignment);
}

// Host-memory implementation of the allocateImpl hook. Forwards to
// USMHostAllocImpl with Context (presumably the context stored by
// USMMemoryAllocBase - confirm) and nullptr for the allocation properties,
// returning that call's result unchanged.
pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
pi_uint32 Alignment) {
return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
}

void *USMMemoryAllocBase::allocate(size_t Size) {
void *Ptr = nullptr;

Expand Down Expand Up @@ -5545,6 +5562,8 @@ pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
} catch (const UsmAllocationException &Ex) {
*ResultPtr = nullptr;
return Ex.getError();
} catch (...) {
return PI_ERROR_UNKNOWN;
}

return PI_SUCCESS;
Expand Down Expand Up @@ -5572,6 +5591,34 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
} catch (const UsmAllocationException &Ex) {
*ResultPtr = nullptr;
return Ex.getError();
} catch (...) {
return PI_ERROR_UNKNOWN;
}

return PI_SUCCESS;
}

pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
pi_usm_mem_properties *Properties, size_t Size,
pi_uint32 Alignment) {
if (!UseUSMAllocator ||
// L0 spec says that allocation fails if Alignment != 2^n, in order to
// keep the same behavior for the allocator, just call L0 API directly and
// return the error code.
((Alignment & (Alignment - 1)) != 0)) {
return USMHostAllocImpl(ResultPtr, Context, Properties, Size, Alignment);
}

// There is a single allocator for Host USM allocations in each context, so
// we don't need to look up the allocator by device as we do for Shared and
// Device allocations.
try {
*ResultPtr = Context->HostMemAllocContext->allocate(Size, Alignment);
} catch (const UsmAllocationException &Ex) {
*ResultPtr = nullptr;
return Ex.getError();
} catch (...) {
return PI_ERROR_UNKNOWN;
}

return PI_SUCCESS;
Expand All @@ -5592,6 +5639,18 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) {
(Context->ZeContext, Ptr, &ZeMemoryAllocationProperties,
&ZeDeviceHandle));

// If memory type is host release from host pool
if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) {
try {
Context->HostMemAllocContext->deallocate(Ptr);
} catch (const UsmAllocationException &Ex) {
return Ex.getError();
} catch (...) {
return PI_ERROR_UNKNOWN;
}
return PI_SUCCESS;
}

if (ZeDeviceHandle) {
// All devices in the context are of the same platform.
auto Platform = Context->Devices[0]->Platform;
Expand Down
19 changes: 18 additions & 1 deletion sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ class USMDeviceMemoryAlloc : public USMMemoryAllocBase {
: USMMemoryAllocBase(Ctx, Dev) {}
};

// Allocation routines for host memory type.
// Specializes USMMemoryAllocBase for host USM: only the allocateImpl hook is
// overridden, everything else is inherited from the base class.
class USMHostMemoryAlloc : public USMMemoryAllocBase {
protected:
  // Carries out the host allocation for the given size and alignment and
  // stores the result in *ResultPtr. Defined out of line.
  pi_result allocateImpl(void **ResultPtr, size_t Size,
                         pi_uint32 Alignment) override;

public:
  // The base class is given a null device because no device is associated
  // with this allocator. The constructor is explicit so that a pi_context is
  // never silently converted into an allocator object.
  explicit USMHostMemoryAlloc(pi_context Ctx)
      : USMMemoryAllocBase(Ctx, nullptr) {}
};

struct _pi_device : _pi_object {
_pi_device(ze_device_handle_t Device, pi_platform Plt,
bool isSubDevice = false)
Expand Down Expand Up @@ -196,6 +206,11 @@ struct _pi_context : _pi_object {
// NOTE: one must additionally call initialize() to complete
// PI context creation.
}
// Create USM allocator context for host. Device and Shared USM allocations
// are device-specific. Host allocations are not device-dependent therefore
// we don't need a map with device as key.
HostMemAllocContext = new USMAllocContext(
std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
}

// Initialize the PI context.
Expand Down Expand Up @@ -260,10 +275,12 @@ struct _pi_context : _pi_object {
pi_result decrementAliveEventsInPool(ze_event_pool_handle_t pool);

// Store USM allocator context(internal allocator structures)
// for USM shared/host and device allocations. There is 1 allocator context
// for USM shared and device allocations. There is 1 allocator context
// per each pair of (context, device) per each memory type.
std::unordered_map<pi_device, USMAllocContext> SharedMemAllocContexts;
std::unordered_map<pi_device, USMAllocContext> DeviceMemAllocContexts;
// Store the host allocator context. It does not depend on any device.
USMAllocContext *HostMemAllocContext;

private:
// Following member variables are used to manage assignment of events
Expand Down
Loading