diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index f5d30563bb163..258d137080bda 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -28,6 +28,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
 | SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availability and workload demand. |
 | SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in Level Zero plugin (each memory request will go directly to Level Zero runtime) |
+| SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR | MaxPoolableSize,Capacity,MaxPoolSize | Values specified as positive integers. Defaults are 1, 4, 256. MaxPoolableSize is the maximum allocation size in MB that may be pooled. Capacity is the number of allocations in each size range that are freed by the program but retained in the pool for reallocation. Size ranges follow this pattern: 32, 48, 64, 96, 128, 192, and so on, i.e., powers of 2 with one intermediate size in between. MaxPoolSize is the maximum size of the pool in MB. |
 | SYCL_PI_LEVEL_ZERO_BATCH_SIZE | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
 | SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The default is 1. |
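A quick usage sketch of the new knob (the two settings below mirror the RUN lines of the test added at the end of this patch; empty fields keep their defaults):

```sh
# Pool allocations of up to 2 MB, but keep at most 1 freed allocation per
# size range (Capacity=1); MaxPoolSize keeps its 256 MB default.
export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1

# Fields may be left empty: pool allocations of up to 2 MB and cap the whole
# pool at 2 MB, leaving Capacity at its default of 4.
export SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2
```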
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 2b5270afb0fb3..3320bad74789a 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -2458,7 +2458,6 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
   }
 
   void *Ptr;
-  ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice;
 
   // We treat integrated devices (physical memory shared with the CPU)
   // differently from discrete devices (those with distinct memories).
@@ -2478,20 +2477,33 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
   // }
 
+  // Choose an alignment that is at most 64 and is the next power of 2 for
+  // sizes less than 64.
+  auto Alignment = Size;
+  if (Alignment > 32UL)
+    Alignment = 64UL;
+  else if (Alignment > 16UL)
+    Alignment = 32UL;
+  else if (Alignment > 8UL)
+    Alignment = 16UL;
+  else if (Alignment > 4UL)
+    Alignment = 8UL;
+  else if (Alignment > 2UL)
+    Alignment = 4UL;
+  else if (Alignment > 1UL)
+    Alignment = 2UL;
+  else
+    Alignment = 1UL;
+
+  pi_result Result;
   if (DeviceIsIntegrated) {
-    ze_host_mem_alloc_desc_t ZeDesc = {};
-    ZeDesc.flags = 0;
-
-    ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, &Ptr));
-
+    Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
   } else {
-    ze_device_mem_alloc_desc_t ZeDesc = {};
-    ZeDesc.flags = 0;
-    ZeDesc.ordinal = 0;
-
-    ZE_CALL(zeMemAllocDevice,
-            (Context->ZeContext, &ZeDesc, Size, 1, ZeDevice, &Ptr));
+    Result = piextUSMDeviceAlloc(&Ptr, Context, Context->Devices[0], nullptr,
+                                 Size, Alignment);
   }
+  if (Result != PI_SUCCESS)
+    return Result;
 
   if (HostPtr) {
     if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
@@ -2559,7 +2571,7 @@ pi_result piMemRelease(pi_mem Mem) {
   } else {
     auto Buf = static_cast<_pi_buffer *>(Mem);
     if (!Buf->isSubBuffer()) {
-      ZE_CALL(zeMemFree, (Mem->Context->ZeContext, Mem->getZeHandle()));
+      PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
     }
   }
   delete Mem;
@@ -5388,28 +5400,6 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program,
   return mapError(ZeResult);
 }
 
-pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
-                            pi_usm_mem_properties *Properties, size_t Size,
-                            pi_uint32 Alignment) {
-  PI_ASSERT(Context, PI_INVALID_CONTEXT);
-
-  // Check that incorrect bits are not set in the properties.
-  PI_ASSERT(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)),
-            PI_INVALID_VALUE);
-
-  ze_host_mem_alloc_desc_t ZeDesc = {};
-  ZeDesc.flags = 0;
-  // TODO: translate PI properties to Level Zero flags
-  ZE_CALL(zeMemAllocHost,
-          (Context->ZeContext, &ZeDesc, Size, Alignment, ResultPtr));
-
-  PI_ASSERT(Alignment == 0 ||
-                reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
-            PI_INVALID_VALUE);
-
-  return PI_SUCCESS;
-}
-
 static bool ShouldUseUSMAllocator() {
   // Enable allocator by default if it's not explicitly disabled
   return std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR") == nullptr;
@@ -5417,10 +5407,10 @@ static bool ShouldUseUSMAllocator() {
 }
 
 static const bool UseUSMAllocator = ShouldUseUSMAllocator();
 
-pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
-                             pi_device Device,
-                             pi_usm_mem_properties *Properties, size_t Size,
-                             pi_uint32 Alignment) {
+static pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
+                                    pi_device Device,
+                                    pi_usm_mem_properties *Properties,
+                                    size_t Size, pi_uint32 Alignment) {
   PI_ASSERT(Context, PI_INVALID_CONTEXT);
   PI_ASSERT(Device, PI_INVALID_DEVICE);
 
@@ -5442,10 +5432,10 @@ pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
-pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
-                             pi_device Device,
-                             pi_usm_mem_properties *Properties, size_t Size,
-                             pi_uint32 Alignment) {
+static pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
+                                    pi_device Device,
+                                    pi_usm_mem_properties *Properties,
+                                    size_t Size, pi_uint32 Alignment) {
   PI_ASSERT(Context, PI_INVALID_CONTEXT);
   PI_ASSERT(Device, PI_INVALID_DEVICE);
 
@@ -5469,7 +5459,29 @@ pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
   return PI_SUCCESS;
 }
 
-pi_result USMFreeImpl(pi_context Context, void *Ptr) {
+static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context,
+                                  pi_usm_mem_properties *Properties,
+                                  size_t Size, pi_uint32 Alignment) {
+  PI_ASSERT(Context, PI_INVALID_CONTEXT);
+
+  // Check that incorrect bits are not set in the properties.
+  PI_ASSERT(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)),
+            PI_INVALID_VALUE);
+
+  // TODO: translate PI properties to Level Zero flags
+  ze_host_mem_alloc_desc_t ZeHostDesc = {};
+  ZeHostDesc.flags = 0;
+  ZE_CALL(zeMemAllocHost,
+          (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr));
+
+  PI_ASSERT(Alignment == 0 ||
+                reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
+            PI_INVALID_VALUE);
+
+  return PI_SUCCESS;
+}
+
+static pi_result USMFreeImpl(pi_context Context, void *Ptr) {
   ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
   return PI_SUCCESS;
 }
@@ -5495,6 +5507,11 @@ pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
                      Alignment);
 }
 
+pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
+                                           pi_uint32 Alignment) {
+  return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+}
+
 void *USMMemoryAllocBase::allocate(size_t Size) {
   void *Ptr = nullptr;
 
@@ -5545,6 +5562,8 @@ pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
   } catch (const UsmAllocationException &Ex) {
     *ResultPtr = nullptr;
     return Ex.getError();
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
   }
 
   return PI_SUCCESS;
@@ -5572,6 +5591,34 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
   } catch (const UsmAllocationException &Ex) {
     *ResultPtr = nullptr;
     return Ex.getError();
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
+  }
+
+  return PI_SUCCESS;
+}
+
+pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
+                            pi_usm_mem_properties *Properties, size_t Size,
+                            pi_uint32 Alignment) {
+  if (!UseUSMAllocator ||
+      // L0 spec says that allocation fails if Alignment != 2^n. In order to
+      // keep the same behavior for the allocator, just call the L0 API
+      // directly and return the error code.
+      ((Alignment & (Alignment - 1)) != 0)) {
+    return USMHostAllocImpl(ResultPtr, Context, Properties, Size, Alignment);
+  }
+
+  // There is a single allocator for Host USM allocations, so we don't need to
+  // find the allocator depending on context as we do for Shared and Device
+  // allocations.
+  try {
+    *ResultPtr = Context->HostMemAllocContext->allocate(Size, Alignment);
+  } catch (const UsmAllocationException &Ex) {
+    *ResultPtr = nullptr;
+    return Ex.getError();
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
   }
 
   return PI_SUCCESS;
@@ -5592,6 +5639,18 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) {
           (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties,
            &ZeDeviceHandle));
 
+  // If the memory type is host, release it from the host pool.
+  if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) {
+    try {
+      Context->HostMemAllocContext->deallocate(Ptr);
+    } catch (const UsmAllocationException &Ex) {
+      return Ex.getError();
+    } catch (...) {
+      return PI_ERROR_UNKNOWN;
+    }
+    return PI_SUCCESS;
+  }
+
   if (ZeDeviceHandle) {
     // All devices in the context are of the same platform.
     auto Platform = Context->Devices[0]->Platform;
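An editorial aside on the bypass condition added in `piextUSMHostAlloc` above: `(Alignment & (Alignment - 1)) != 0` is the usual single-set-bit test. A minimal self-contained sketch (the helper name is ours, not the plugin's):

```cpp
#include <cassert>
#include <cstdint>

// A nonzero value V is a power of two exactly when it has a single set bit,
// i.e. (V & (V - 1)) == 0. Note that V == 0 also passes the test, so an
// unspecified alignment of 0 still takes the pooled path.
static bool PowerOfTwoOrZero(uint32_t V) { return (V & (V - 1)) == 0; }

int main() {
  assert(PowerOfTwoOrZero(0));   // default alignment: pooled path
  assert(PowerOfTwoOrZero(64));  // 2^6: pooled path
  assert(!PowerOfTwoOrZero(48)); // not 2^n: sent straight to Level Zero,
                                 // which reports the error itself
  return 0;
}
```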
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index 3b7612d3f38de..b74cda0146146 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -130,6 +130,16 @@ class USMDeviceMemoryAlloc : public USMMemoryAllocBase {
       : USMMemoryAllocBase(Ctx, Dev) {}
 };
 
+// Allocation routines for host memory type
+class USMHostMemoryAlloc : public USMMemoryAllocBase {
+protected:
+  pi_result allocateImpl(void **ResultPtr, size_t Size,
+                         pi_uint32 Alignment) override;
+
+public:
+  USMHostMemoryAlloc(pi_context Ctx) : USMMemoryAllocBase(Ctx, nullptr) {}
+};
+
 struct _pi_device : _pi_object {
   _pi_device(ze_device_handle_t Device, pi_platform Plt,
              bool isSubDevice = false)
@@ -196,6 +206,11 @@ struct _pi_context : _pi_object {
       // NOTE: one must additionally call initialize() to complete
      // PI context creation.
    }
+    // Create USM allocator context for host. Device and Shared USM
+    // allocations are device-specific. Host allocations are not
+    // device-dependent, therefore we don't need a map with the device as key.
+    HostMemAllocContext = new USMAllocContext(
+        std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));
   }
 
   // Initialize the PI context.
@@ -260,10 +275,12 @@ struct _pi_context : _pi_object {
   pi_result decrementAliveEventsInPool(ze_event_pool_handle_t pool);
 
   // Store USM allocator context(internal allocator structures)
-  // for USM shared/host and device allocations. There is 1 allocator context
+  // for USM shared and device allocations. There is 1 allocator context
   // per each pair of (context, device) per each memory type.
   std::unordered_map<pi_device, USMAllocContext> SharedMemAllocContexts;
   std::unordered_map<pi_device, USMAllocContext> DeviceMemAllocContexts;
+  // Store the host allocator context. It does not depend on any device.
+  USMAllocContext *HostMemAllocContext;
 
 private:
   // Following member variables are used to manage assignment of events
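For orientation, the class added above plugs into the existing `USMMemoryAllocBase` hierarchy: the base owns the pooling entry points and defers the final system allocation to a per-memory-type `allocateImpl`. A reduced sketch of that pattern with stand-in types (`zeMemAllocHost` replaced by `operator new`; these are not the plugin's real signatures):

```cpp
#include <cstddef>
#include <memory>

// Base class owns the common allocate() entry point.
struct MemoryAllocBase {
  virtual ~MemoryAllocBase() = default;
  void *allocate(std::size_t Size) { return allocateImpl(Size); }

protected:
  virtual void *allocateImpl(std::size_t Size) = 0;
};

// Host flavor needs no device handle, mirroring
// USMHostMemoryAlloc(Ctx) : USMMemoryAllocBase(Ctx, nullptr).
struct HostMemoryAlloc final : MemoryAllocBase {
protected:
  void *allocateImpl(std::size_t Size) override { return ::operator new(Size); }
};

int main() {
  std::unique_ptr<MemoryAllocBase> Pool = std::make_unique<HostMemoryAlloc>();
  void *P = Pool->allocate(1024);
  ::operator delete(P); // the real pool would retain this memory for reuse
  return 0;
}
```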
diff --git a/sycl/plugins/level_zero/usm_allocator.cpp b/sycl/plugins/level_zero/usm_allocator.cpp
index 1596bf71bb427..e675c73ca8437 100644
--- a/sycl/plugins/level_zero/usm_allocator.cpp
+++ b/sycl/plugins/level_zero/usm_allocator.cpp
@@ -20,18 +20,40 @@
 #include <vector>
 
 #include "usm_allocator.hpp"
+#include <CL/sycl/detail/spinlock.hpp>
+#include <cstdlib>
+
+// USM allocations are a minimum of 64KB in size even when a smaller size is
+// requested. The implementation distinguishes between allocations of size
+// ChunkCutOff (32KB) or less and those that are larger.
+// Allocation requests smaller than ChunkCutOff use chunks taken from a single
+// 64KB USM allocation. Thus, for example, for 8-byte allocations, only 1 in
+// ~8000 requests results in a new USM allocation. Freeing merely marks a
+// chunk of the larger 64KB allocation as available; nothing is actually
+// returned to the system. An allocation is returned to the system only when
+// all chunks in a 64KB allocation are freed by the program.
+// Allocations larger than ChunkCutOff use a separate USM allocation for each
+// request. These are subject to "pooling". That is, when such an allocation
+// is freed by the program it is retained in a pool. The pool is available
+// for future allocations, which means there are fewer actual USM
+// allocations/deallocations.
 
 namespace settings {
-// Size of the slab which is going to be requested from the system.
-static constexpr size_t SlabSize = 64 * 1024; // 64K
+// Minimum allocation size that will be requested from the system.
+static constexpr size_t SlabMinSize = 64 * 1024; // 64KB
+
+// Allocations <= ChunkCutOff will use chunks from individual slabs.
+// Allocations > ChunkCutOff will be rounded up to a multiple of
+// SlabMinSize and allocated to occupy the whole slab.
+static constexpr size_t ChunkCutOff = SlabMinSize / 2;
 
 // The largest size which is allocated via the allocator.
 // Allocations with size > CutOff bypass the USM allocator and
 // go directly to the runtime.
-static constexpr size_t CutOff = SlabSize / 2;
+static constexpr size_t CutOff = (size_t)1 << 31; // 2GB
 
 // Unfortunately we cannot deduce the size of the array, so every change
 // to the number of buckets should be reflected here.
-using BucketsArrayType = std::array<size_t, 21>;
+using BucketsArrayType = std::array<size_t, 53>;
 
 // Generates a list of bucket sizes used by the allocator.
 static constexpr BucketsArrayType generateBucketSizes() {
@@ -41,7 +63,7 @@ static constexpr BucketsArrayType generateBucketSizes() {
   // allow to write this in a nicer way.
 
   // Simple helper to compute power of 2
-#define P(n) (1 << n)
+#define P(n) (1ULL << n)
 
   BucketsArrayType Sizes = {32,    48,
                             64,    96,
@@ -53,6 +75,22 @@ static constexpr BucketsArrayType generateBucketSizes() {
                             P(12), P(12) + P(11),
                             P(13), P(13) + P(12),
                             P(14), P(14) + P(13),
+                            P(15), P(15) + P(14),
+                            P(16), P(16) + P(15),
+                            P(17), P(17) + P(16),
+                            P(18), P(18) + P(17),
+                            P(19), P(19) + P(18),
+                            P(20), P(20) + P(19),
+                            P(21), P(21) + P(20),
+                            P(22), P(22) + P(21),
+                            P(23), P(23) + P(22),
+                            P(24), P(24) + P(23),
+                            P(25), P(25) + P(24),
+                            P(26), P(26) + P(25),
+                            P(27), P(27) + P(26),
+                            P(28), P(28) + P(27),
+                            P(29), P(29) + P(28),
+                            P(30), P(30) + P(29),
                             CutOff};
 #undef P
@@ -61,9 +99,55 @@
 
 static constexpr BucketsArrayType BucketSizes = generateBucketSizes();
 
-// The implementation expects that SlabSize is 2^n
-static_assert((SlabSize & (SlabSize - 1)) == 0,
-              "SlabSize must be a power of 2");
+// The implementation expects that SlabMinSize is 2^n
+static_assert((SlabMinSize & (SlabMinSize - 1)) == 0,
+              "SlabMinSize must be a power of 2");
+
+// Protects the capacity checking of the pool.
+static sycl::detail::SpinLock PoolLock;
+
+static class SetLimits {
+public:
+  size_t MaxPoolableSize = 1;
+  size_t Capacity = 4;
+  size_t MaxPoolSize = 256;
+  size_t CurPoolSize = 0;
+
+  SetLimits() {
+    // Parse optional parameters of this form (applicable to each context):
+    // SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=[<MaxPoolableSize>][,[<Capacity>][,[<MaxPoolSize>]]]
+    // MaxPoolableSize: Maximum poolable allocation size, specified in MB.
+    //                  Default 1MB.
+    // Capacity: Number of pooled allocations in each bucket.
+    //           Default 4.
+    // MaxPoolSize: Maximum size of pool, specified in MB.
+    //              Default 256MB.
+
+    char *PoolParams = getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR");
+    if (PoolParams != nullptr) {
+      std::string Params(PoolParams);
+      size_t Pos = Params.find(',');
+      if (Pos != std::string::npos) {
+        if (Pos > 0)
+          MaxPoolableSize = std::stoi(Params.substr(0, Pos));
+        Params.erase(0, Pos + 1);
+        Pos = Params.find(',');
+        if (Pos != std::string::npos) {
+          if (Pos > 0)
+            Capacity = std::stoi(Params.substr(0, Pos));
+          Params.erase(0, Pos + 1);
+          if (Pos != std::string::npos)
+            MaxPoolSize = std::stoi(Params);
+        } else {
+          Capacity = std::stoi(Params);
+        }
+      } else
+        MaxPoolableSize = std::stoi(Params);
+    }
+    MaxPoolableSize *= (1 << 20);
+    MaxPoolSize *= (1 << 20);
+  }
+} USMPoolSettings;
 } // namespace settings
 
 // Aligns the pointer down to the specified alignment
@@ -93,7 +177,7 @@ static size_t AlignUp(size_t Val, size_t Alignment) {
 
 class Bucket;
 
-// Represents the allocated memory block of size 'settings::SlabSize'
+// Represents the allocated memory block of size 'settings::SlabMinSize'
 // Internally, it splits the memory block into chunks. The number of
 // chunks depends on the size of a Bucket which created the Slab.
 // The chunks
@@ -101,7 +185,7 @@ class Bucket;
 // so no locking happens here.
 class Slab {
 
-  // Pointer to the allocated memory of SlabSize bytes
+  // Pointer to the allocated memory of SlabMinSize bytes
   void *MemPtr;
 
   // Represents the current state of each chunk:
@@ -142,11 +226,15 @@ class Slab {
 
   size_t getNumAllocated() const { return NumAllocated; }
 
-  void *getFreeChunk();
+  // Get pointer to allocation that is one piece of this slab.
+  void *getChunk();
+
+  // Get pointer to allocation that is this entire slab.
+  void *getSlab();
 
   void *getPtr() const { return MemPtr; }
   void *getEnd() const {
-    return static_cast<char *>(getPtr()) + settings::SlabSize;
+    return static_cast<char *>(getPtr()) + settings::SlabMinSize;
   }
 
   size_t getChunkSize() const;
@@ -180,17 +268,37 @@ class Bucket {
   Bucket(size_t Sz, USMAllocContext::USMAllocImpl &AllocCtx)
       : Size{Sz}, OwnAllocCtx{AllocCtx} {}
 
+  // Get pointer to allocation that is one piece of an available slab in this
+  // bucket.
   void *getChunk();
 
+  // Get pointer to allocation that is a full slab in this bucket.
+  void *getSlab();
+
   size_t getSize() const { return Size; }
 
+  // Free an allocation that is one piece of a slab in this bucket.
   void freeChunk(void *Ptr, Slab &Slab);
+
+  // Free an allocation that is a full slab in this bucket.
+  void freeSlab(void *Ptr, Slab &Slab);
+
   SystemMemory &getMemHandle();
+
   USMAllocContext::USMAllocImpl &getUsmAllocCtx() { return OwnAllocCtx; }
 
+  // Check whether an allocation to be freed can be placed in the pool.
+  bool CanPool();
+
 private:
   void onFreeChunk(Slab &);
+
+  // Get a slab to be used for chunked allocations.
+  // These slabs are used for allocations <= ChunkCutOff and not pooled.
   decltype(AvailableSlabs.begin()) getAvailSlab();
+
+  // Get a slab that will be used as a whole for a single allocation.
+  // These slabs are > ChunkCutOff in size and pooled.
+  decltype(AvailableSlabs.begin()) getAvailFullSlab();
 };
 
 class USMAllocContext::USMAllocImpl {
@@ -242,12 +350,14 @@ std::ostream &operator<<(std::ostream &Os, const Slab &Slab) {
 }
 
 Slab::Slab(Bucket &Bkt)
-    : MemPtr(Bkt.getMemHandle().allocate(settings::SlabSize)),
-      // In case if bucket size is not that SlabSize % b.getSize() == 0, we
-      // would have some padding at the end of the slab.
-      Chunks(settings::SlabSize / Bkt.getSize()), NumAllocated{0},
+    : // In case bucket size is not a multiple of SlabMinSize, we would have
+      // some padding at the end of the slab.
+      Chunks(settings::SlabMinSize / Bkt.getSize()), NumAllocated{0},
       bucket(Bkt), SlabListIter{}, FirstFreeChunkIdx{0} {
-
+  size_t SlabAllocSize = Bkt.getSize();
+  if (SlabAllocSize < settings::SlabMinSize)
+    SlabAllocSize = settings::SlabMinSize;
+  MemPtr = Bkt.getMemHandle().allocate(SlabAllocSize);
   regSlab(*this);
 }
@@ -268,7 +378,7 @@ size_t Slab::FindFirstAvailableChunkIdx() const {
   return static_cast<size_t>(-1);
 }
 
-void *Slab::getFreeChunk() {
+void *Slab::getChunk() {
   assert(NumAllocated != Chunks.size());
 
   const size_t ChunkIdx = FindFirstAvailableChunkIdx();
@@ -286,6 +396,8 @@ void *Slab::getFreeChunk() {
   return FreeChunk;
 }
 
+void *Slab::getSlab() { return getPtr(); }
+
 Bucket &Slab::getBucket() { return bucket; }
 const Bucket &Slab::getBucket() const { return bucket; }
 
@@ -320,16 +432,16 @@ void Slab::unregSlabByAddr(void *Addr, Slab &Slab) {
 }
 
 void Slab::regSlab(Slab &Slab) {
-  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabSize);
-  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabSize;
+  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabMinSize);
+  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabMinSize;
 
   regSlabByAddr(StartAddr, Slab);
   regSlabByAddr(EndAddr, Slab);
 }
 
 void Slab::unregSlab(Slab &Slab) {
-  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabSize);
-  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabSize;
+  void *StartAddr = AlignPtrDown(Slab.getPtr(), settings::SlabMinSize);
+  void *EndAddr = static_cast<char *>(StartAddr) + settings::SlabMinSize;
 
   unregSlabByAddr(StartAddr, Slab);
   unregSlabByAddr(EndAddr, Slab);
@@ -359,6 +471,47 @@ void Slab::freeChunk(void *Ptr) {
 
 bool Slab::hasAvail() { return NumAllocated != getNumChunks(); }
 
+auto Bucket::getAvailFullSlab() -> decltype(AvailableSlabs.begin()) {
+  // Return a slab that will be used for a single allocation.
+  if (AvailableSlabs.size() == 0) {
+    auto It = AvailableSlabs.insert(AvailableSlabs.begin(),
+                                    std::make_unique<Slab>(*this));
+    (*It)->setIterator(It);
+  } else {
+    // If a slab was available in the pool then note that the current pooled
+    // size has reduced by the size of this slab.
+    settings::USMPoolSettings.CurPoolSize -= Size;
+  }
+
+  return AvailableSlabs.begin();
+}
+
+void *Bucket::getSlab() {
+  std::lock_guard Lg(BucketLock);
+
+  auto SlabIt = getAvailFullSlab();
+  auto *FreeSlab = (*SlabIt)->getSlab();
+  auto It =
+      UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt));
+  AvailableSlabs.erase(SlabIt);
+  (*It)->setIterator(It);
+  return FreeSlab;
+}
+
+void Bucket::freeSlab(void *Ptr, Slab &Slab) {
+  std::lock_guard Lg(BucketLock);
+  auto SlabIter = Slab.getIterator();
+  assert(SlabIter != UnavailableSlabs.end());
+  if (CanPool()) {
+    auto It =
+        AvailableSlabs.insert(AvailableSlabs.begin(), std::move(*SlabIter));
+    UnavailableSlabs.erase(SlabIter);
+    (*It)->setIterator(It);
+  } else {
+    UnavailableSlabs.erase(SlabIter);
+  }
+}
+
 auto Bucket::getAvailSlab() -> decltype(AvailableSlabs.begin()) {
   if (AvailableSlabs.size() == 0) {
     auto It = AvailableSlabs.insert(AvailableSlabs.begin(),
@@ -373,9 +526,9 @@ void *Bucket::getChunk() {
   std::lock_guard Lg(BucketLock);
 
   auto SlabIt = getAvailSlab();
-  auto *FreeChunk = (*SlabIt)->getFreeChunk();
+  auto *FreeChunk = (*SlabIt)->getChunk();
 
-  // If the slab is full, move it to unavailable slabs and update its itreator
+  // If the slab is full, move it to unavailable slabs and update its iterator
   if (!((*SlabIt)->hasAvail())) {
     auto It =
         UnavailableSlabs.insert(UnavailableSlabs.begin(), std::move(*SlabIt));
@@ -409,15 +562,33 @@ void Bucket::onFreeChunk(Slab &Slab) {
     (*It)->setIterator(It);
   }
 
-  // Remove the slab when all the chunks from it are deallocated
-  // Note: since the slab is stored as unique_ptr, just remove it from
-  // the list to remove the list to destroy the object
+  // If slab has no chunks allocated we could pool it if capacity is available
+  // or release it to the system.
   if (Slab.getNumAllocated() == 0) {
-    auto It = Slab.getIterator();
-    assert(It != AvailableSlabs.end());
+    // Pool has no space, so release the slab.
+    if (!CanPool()) {
+      // Remove the slab when all the chunks from it are deallocated.
+      // Note: since the slab is stored as unique_ptr, just remove it from
+      // the list to destroy the object.
+      auto It = Slab.getIterator();
+      assert(It != AvailableSlabs.end());
+
+      AvailableSlabs.erase(It);
+    }
+  }
+}
 
-    AvailableSlabs.erase(It);
+bool Bucket::CanPool() {
+  std::lock_guard Lock{settings::PoolLock};
+  size_t NewFreeSlabsInBucket = AvailableSlabs.size() + 1;
+  if (settings::USMPoolSettings.Capacity >= NewFreeSlabsInBucket) {
+    size_t NewPoolSize = settings::USMPoolSettings.CurPoolSize + Size;
+    if (settings::USMPoolSettings.MaxPoolSize >= NewPoolSize) {
+      settings::USMPoolSettings.CurPoolSize = NewPoolSize;
+      return true;
+    }
   }
+  return false;
 }
 
 SystemMemory &Bucket::getMemHandle() { return OwnAllocCtx.getMemHandle(); }
 
@@ -426,10 +597,16 @@ void *USMAllocContext::USMAllocImpl::allocate(size_t Size) {
   if (Size == 0)
     return nullptr;
 
-  if (Size > settings::CutOff)
+  if (Size > settings::USMPoolSettings.MaxPoolableSize) {
     return getMemHandle().allocate(Size);
+  }
+
+  auto &Bucket = findBucket(Size);
+  if (Size > settings::ChunkCutOff) {
+    return Bucket.getSlab();
+  }
 
-  return findBucket(Size).getChunk();
+  return Bucket.getChunk();
 }
 
 void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment) {
@@ -441,13 +618,19 @@ void *USMAllocContext::USMAllocImpl::allocate(size_t Size, size_t Alignment) {
 
   size_t AlignedSize = (Size > 1) ? AlignUp(Size, Alignment) : Alignment;
 
-  // Check if our largest chunk is able to fit aligned size.
+  // Check if requested allocation size is within pooling limit.
   // If not, just request aligned pointer from the system.
-  if (AlignedSize > settings::CutOff) {
+  if (AlignedSize > settings::USMPoolSettings.MaxPoolableSize) {
     return getMemHandle().allocate(Size, Alignment);
   }
 
-  auto *Ptr = findBucket(AlignedSize).getChunk();
+  void *Ptr;
+  auto &Bucket = findBucket(AlignedSize);
+  if (AlignedSize > settings::ChunkCutOff) {
+    Ptr = Bucket.getSlab();
+  } else {
+    Ptr = Bucket.getChunk();
+  }
 
   return AlignPtrUp(Ptr, Alignment);
 }
@@ -464,7 +647,7 @@ Bucket &USMAllocContext::USMAllocImpl::findBucket(size_t Size) {
 }
 
 void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) {
-  auto *SlabPtr = AlignPtrDown(Ptr, settings::SlabSize);
+  auto *SlabPtr = AlignPtrDown(Ptr, settings::SlabMinSize);
 
   // Lock the map on read
   std::shared_lock Lk(getKnownSlabsMapLock());
@@ -481,12 +664,15 @@ void USMAllocContext::USMAllocImpl::deallocate(void *Ptr) {
     // protected by the lock, so it's safe to access it here.
     auto &Slab = It->second;
     if (Ptr >= Slab.getPtr() && Ptr < Slab.getEnd()) {
-
       // Unlock the map before freeing the chunk, it may be locked on write
       // there
      Lk.unlock();
       auto &Bucket = Slab.getBucket();
-      Bucket.freeChunk(Ptr, Slab);
+      if (Bucket.getSize() <= settings::ChunkCutOff) {
+        Bucket.freeChunk(Ptr, Slab);
+      } else {
+        Bucket.freeSlab(Ptr, Slab);
+      }
       return;
     }
   }
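Putting the allocator changes together, a request now falls into one of three tiers. A small standalone sketch of the tier selection under the default limits (constants copied from the patch; the branches paraphrase `USMAllocContext::USMAllocImpl::allocate` rather than quoting it):

```cpp
#include <cstddef>
#include <iostream>

constexpr std::size_t SlabMinSize = 64 * 1024;       // 64KB
constexpr std::size_t ChunkCutOff = SlabMinSize / 2; // 32KB
constexpr std::size_t MaxPoolableSize = 1 << 20;     // 1MB default

int main() {
  for (std::size_t Req : {8UL, 40000UL, 512UL * 1024, 2UL << 20}) {
    if (Req > MaxPoolableSize)
      std::cout << Req << " B -> direct zeMemAlloc*, never pooled\n";
    else if (Req > ChunkCutOff)
      std::cout << Req << " B -> own slab (bucket-sized, at least 64KB), "
                          "pooled when freed\n";
    else
      std::cout << Req << " B -> chunk carved from a shared 64KB slab\n";
  }
  return 0;
}
```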
diff --git a/sycl/test/on-device/usm_pooling.cpp b/sycl/test/on-device/usm_pooling.cpp
new file mode 100755
index 0000000000000..b7d968eecfa21
--- /dev/null
+++ b/sycl/test/on-device/usm_pooling.cpp
@@ -0,0 +1,127 @@
+// REQUIRES: level_zero
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+
+// Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB.
+
+// With no pooling: 1, 2, 3, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-NOPOOL
+
+// With pooling enabled and MaxPoolableSize=1MB: 1, 2, 3, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-12345
+
+// With pooling enabled and Capacity=1: 1, 2, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,1 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+
+// With pooling enabled and MaxPoolSize=2MB: 1, 2, 4, 5 allocs lead to ZE call.
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2 %t.out h 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2 %t.out d 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=2,,2 %t.out s 2> %t1.out; cat %t1.out %GPU_CHECK_PLACEHOLDER --check-prefix CHECK-1245
+
+#include "CL/sycl.hpp"
+using namespace sycl;
+
+constexpr size_t SIZE = 2 * 1024 * 1024;
+
+void test_host(context C) {
+
+  void* ph1 = malloc_host(SIZE, C);
+  void* ph2 = malloc_host(SIZE, C);
+  free(ph1, C);
+  free(ph2, C);
+  void* ph3 = malloc_host(SIZE, C);
+  void* ph4 = malloc_host(SIZE, C);
+  void* ph5 = malloc_host(SIZE, C);
+  free(ph3, C);
+  free(ph4, C);
+  free(ph5, C);
+}
+
+void test_device(context C, device D) {
+
+  void* ph1 = malloc_device(SIZE, D, C);
+  void* ph2 = malloc_device(SIZE, D, C);
+  free(ph1, C);
+  free(ph2, C);
+  void* ph3 = malloc_device(SIZE, D, C);
+  void* ph4 = malloc_device(SIZE, D, C);
+  void* ph5 = malloc_device(SIZE, D, C);
+  free(ph3, C);
+  free(ph4, C);
+  free(ph5, C);
+}
+
+void test_shared(context C, device D) {
+
+  void* ph1 = malloc_shared(SIZE, D, C);
+  void* ph2 = malloc_shared(SIZE, D, C);
+  free(ph1, C);
+  free(ph2, C);
+  void* ph3 = malloc_shared(SIZE, D, C);
+  void* ph4 = malloc_shared(SIZE, D, C);
+  void* ph5 = malloc_shared(SIZE, D, C);
+  free(ph3, C);
+  free(ph4, C);
+  free(ph5, C);
+}
+
+int main(int argc, char *argv[]) {
+  queue Q;
+  device D = Q.get_device();
+  context C = Q.get_context();
+
+  const char *devType = D.is_host() ? "Host" : D.is_cpu() ? "CPU" : "GPU";
"CPU" : "GPU"; + std::string pluginName = + D.get_platform().get_info(); + std::cout << "Running on device " << devType << " (" + << D.get_info() << ") " << pluginName + << " plugin\n"; + + if (*argv[1] == 'h') { + std::cerr << "Test zeMemAllocHost\n"; + test_host(C); + } else if (*argv[1] == 'd') { + std::cerr << "Test zeMemAllocDevice\n"; + test_device(C, D); + } else if (*argv[1] == 's') { + std::cerr << "Test zeMemAllocShared\n"; + test_shared(C, D); + } + + return 0; +} + +// CHECK-NOPOOL: Test [[API:zeMemAllocHost|zeMemAllocDevice|zeMemAllocShared]] +// CHECK-NOPOOL-NEXT: ZE ---> [[API]]( +// CHECK-NOPOOL-NEXT: ZE ---> [[API]]( +// CHECK-NOPOOL-NEXT: ZE ---> zeMemFree +// CHECK-NOPOOL-NEXT: ZE ---> zeMemFree +// CHECK-NOPOOL-NEXT: ZE ---> [[API]]( +// CHECK-NOPOOL-NEXT: ZE ---> [[API]]( +// CHECK-NOPOOL-NEXT: ZE ---> [[API]]( + +// CHECK-12345: Test [[API:zeMemAllocHost|zeMemAllocDevice|zeMemAllocShared]] +// CHECK-12345-NEXT: ZE ---> [[API]]( +// CHECK-12345-NEXT: ZE ---> [[API]]( +// CHECK-12345-NEXT: ZE ---> zeMemGetAllocProperties +// CHECK-12345-NEXT: ZE ---> zeMemFree +// CHECK-12345-NEXT: ZE ---> zeMemGetAllocProperties +// CHECK-12345-NEXT: ZE ---> zeMemFree +// CHECK-12345-NEXT: ZE ---> [[API]]( +// CHECK-12345-NEXT: ZE ---> [[API]]( +// CHECK-12345-NEXT: ZE ---> [[API]]( + +// CHECK-1245: Test [[API:zeMemAllocHost|zeMemAllocDevice|zeMemAllocShared]] +// CHECK-1245-NEXT: ZE ---> [[API]]( +// CHECK-1245-NEXT: ZE ---> [[API]]( +// CHECK-1245-NEXT: ZE ---> zeMemGetAllocProperties +// CHECK-1245-NEXT: ZE ---> zeMemGetAllocProperties +// CHECK-1245-NEXT: ZE ---> zeMemFree +// CHECK-1245-NEXT: ZE ---> [[API]]( +// CHECK-1245-NEXT: ZE ---> [[API]](