Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Experimental support for L0 host ptr import. #4891

Merged
merged 17 commits into from
Jan 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/doc/EnvironmentVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ compiler and runtime.
| `SYCL_CACHE_MIN_DEVICE_IMAGE_SIZE` | Positive integer | Minimum size, in bytes, of a device code image that is worth caching on disk, because a disk access may take longer than JIT-compiling the image. The default value is 0, which caches all images. |
| `SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE` | Positive integer | Maximum size, in bytes, of a device image that is cached. Kernels that are too large may fill the disk too quickly. The default value is 1 GB. |
| `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/PlatformContext/PlatformContext.adoc) extension to learn more. Enabled by default on Linux and disabled on Windows. |
| `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enabled by specifying a non-zero value. When enabled, the host data of buffers created with a host pointer is promoted to USM, which improves data transfer performance. To use this feature, also set `SYCL_HOST_UNIFIED_MEMORY=1`. |

`(*) Note: Any means this environment variable is effective when set to any non-null value.`

Expand Down
141 changes: 121 additions & 20 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1635,6 +1635,54 @@ static bool setEnvVar(const char *name, const char *value) {
return true;
}

static class ZeUSMImportExtension {
// Pointers to functions that import/release host memory into USM
ze_result_t (*zexDriverImportExternalPointer)(ze_driver_handle_t hDriver,
void *, size_t);
ze_result_t (*zexDriverReleaseImportedPointer)(ze_driver_handle_t, void *);

public:
// Whether user has requested Import/Release, and platform supports it.
bool Enabled;

ZeUSMImportExtension() : Enabled{false} {}

void setZeUSMImport(pi_platform Platform) {
// Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting
// host ptr import during buffer creation.
const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT");
if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0)
return;

// Check if USM hostptr import feature is available.
ze_driver_handle_t driverHandle = Platform->ZeDriver;
if (ZE_CALL_NOCHECK(zeDriverGetExtensionFunctionAddress,
(driverHandle, "zexDriverImportExternalPointer",
reinterpret_cast<void **>(
&zexDriverImportExternalPointer))) == 0) {
ZE_CALL_NOCHECK(
zeDriverGetExtensionFunctionAddress,
(driverHandle, "zexDriverReleaseImportedPointer",
reinterpret_cast<void **>(&zexDriverReleaseImportedPointer)));
// Hostptr import/release is turned on because it has been requested
// by the env var, and this platform supports the APIs.
Enabled = true;
// Hostptr import is only possible if piMemBufferCreate receives a
// hostptr as an argument. The SYCL runtime passes a host ptr
// only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on.
setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1");
rdeodhar marked this conversation as resolved.
Show resolved Hide resolved
}
}
void doZeUSMImport(ze_driver_handle_t driverHandle, void *HostPtr,
size_t Size) {
ZE_CALL_NOCHECK(zexDriverImportExternalPointer,
(driverHandle, HostPtr, Size));
}
void doZeUSMRelease(ze_driver_handle_t driverHandle, void *HostPtr) {
ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (driverHandle, HostPtr));
}
} ZeUSMImport;
rdeodhar marked this conversation as resolved.
Show resolved Hide resolved

pi_result _pi_platform::initialize() {
// Cache driver properties
ZeStruct<ze_driver_properties_t> ZeDriverProperties;
Expand Down Expand Up @@ -1680,6 +1728,10 @@ pi_result _pi_platform::initialize() {
zeDriverExtensionMap[extension.name] = extension.version;
}

// Check if import user ptr into USM feature has been requested.
// If yes, then set up L0 API pointers if the platform supports it.
ZeUSMImport.setZeUSMImport(this);

return PI_SUCCESS;
}

Expand Down Expand Up @@ -1789,8 +1841,9 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms);
}

if (NumPlatforms)
if (NumPlatforms) {
*NumPlatforms = PiPlatformsCache->size();
}

zePrint("Using %s events\n",
ZeAllHostVisibleEvents ? "all host-visible" : "device-only");
Expand Down Expand Up @@ -3289,32 +3342,69 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
else
Alignment = 1UL;

pi_result Result = PI_SUCCESS;
// If USM Import feature is enabled and hostptr is supplied,
// import the hostptr if not already imported into USM.
rdeodhar marked this conversation as resolved.
Show resolved Hide resolved
// Data transfer rate is maximized when both source and destination
// are USM pointers. Promotion of the host pointer to USM thus
// optimizes data transfer performance.
bool HostPtrImported = false;
if (ZeUSMImport.Enabled && HostPtr != nullptr &&
(Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) {
// Query memory type of the host pointer
ze_device_handle_t ZeDeviceHandle;
ZeStruct<ze_memory_allocation_properties_t> ZeMemoryAllocationProperties;
ZE_CALL(zeMemGetAllocProperties,
(Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties,
&ZeDeviceHandle));

// If not shared of any type, we can import the ptr
if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
// Promote the host ptr to USM host memory
ze_driver_handle_t driverHandle = Context->Devices[0]->Platform->ZeDriver;
ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size);
HostPtrImported = true;
}
}

pi_result Result;
if (DeviceIsIntegrated) {
if (enableBufferPooling()) {
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
} else
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
if (HostPtrImported) {
// When HostPtr is imported we use it for the buffer.
Ptr = HostPtr;
} else {
if (enableBufferPooling()) {
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
} else {
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
}
}
} else if (Context->SingleRootDevice) {
// If we have a single discrete device or all devices in the context are
// sub-devices of the same device then we can allocate on device
if (enableBufferPooling()) {
PI_CALL(piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice,
nullptr, Size, Alignment));
} else
} else {
Result = ZeDeviceMemAllocHelper(&Ptr, Context, Context->SingleRootDevice,
Size);
}
} else {
// Context with several gpu cards. Temporarily use host allocation because
// it is accessible by all devices. But it is not good in terms of
// performance.
// TODO: We need to either allow remote access to device memory using IPC,
// or do explicit memory transfers from one device to another using host
// resources as backing buffers to allow those transfers.
if (enableBufferPooling()) {
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
} else
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
if (HostPtrImported) {
// When HostPtr is imported we use it for the buffer.
Ptr = HostPtr;
} else {
if (enableBufferPooling()) {
PI_CALL(piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment));
} else {
Result = ZeHostMemAllocHelper(&Ptr, Context, Size);
}
}
}

if (Result != PI_SUCCESS)
Expand All @@ -3325,8 +3415,10 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
// Initialize the buffer with user data
if (DeviceIsIntegrated) {
// Do a host to host copy
memcpy(Ptr, HostPtr, Size);
// Do a host to host copy.
// For an imported HostPtr the copy is unneeded.
if (!HostPtrImported)
memcpy(Ptr, HostPtr, Size);
} else if (Context->SingleRootDevice) {
// Initialize the buffer synchronously with immediate offload
ZE_CALL(zeCommandListAppendMemoryCopy,
Expand All @@ -3335,7 +3427,9 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
} else {
// Multiple root devices, do a host to host copy because we use a host
// allocation for this case.
memcpy(Ptr, HostPtr, Size);
// For an imported HostPtr the copy is unneeded.
if (!HostPtrImported)
memcpy(Ptr, HostPtr, Size);
}
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
// Nothing more to do.
Expand All @@ -3350,7 +3444,7 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
*RetMem = new _pi_buffer(
Context, pi_cast<char *>(Ptr) /* Level Zero Memory Handle */,
HostPtrOrNull, nullptr, 0, 0,
DeviceIsIntegrated /* allocation in host memory */);
DeviceIsIntegrated /* allocation in host memory */, HostPtrImported);
} catch (const std::bad_alloc &) {
return PI_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down Expand Up @@ -3420,11 +3514,17 @@ pi_result piMemRelease(pi_mem Mem) {
} else {
auto Buf = static_cast<_pi_buffer *>(Mem);
if (!Buf->isSubBuffer()) {
if (enableBufferPooling()) {
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
if (Mem->HostPtrImported) {
ze_driver_handle_t driverHandle =
Mem->Context->Devices[0]->Platform->ZeDriver;
smaslov-intel marked this conversation as resolved.
Show resolved Hide resolved
ZeUSMImport.doZeUSMRelease(driverHandle, Mem->MapHostPtr);
} else {
if (auto Res = ZeMemFreeHelper(Mem->Context, Mem->getZeHandle()))
return Res;
if (enableBufferPooling()) {
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
} else {
if (auto Res = ZeMemFreeHelper(Mem->Context, Mem->getZeHandle()))
return Res;
}
}
}
}
Expand Down Expand Up @@ -5937,7 +6037,8 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer,

if (Buffer->MapHostPtr) {
*RetMap = Buffer->MapHostPtr + Offset;
if (!(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION))
if (!Buffer->HostPtrImported &&
!(MapFlags & PI_MAP_WRITE_INVALIDATE_REGION))
memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
} else {
*RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset;
Expand Down
15 changes: 10 additions & 5 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,9 @@ struct _pi_mem : _pi_object {
// Flag to indicate that this memory is allocated in host memory
bool OnHost;

// Flag to indicate that the host ptr has been imported into USM
bool HostPtrImported;
rdeodhar marked this conversation as resolved.
Show resolved Hide resolved

// Supplementary data to keep track of the mappings of this memory
// created with piEnqueueMemBufferMap and piEnqueueMemImageMap.
struct Mapping {
Expand Down Expand Up @@ -838,8 +841,10 @@ struct _pi_mem : _pi_object {
pi_result removeMapping(void *MappedTo, Mapping &MapInfo);

protected:
_pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false)
: Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost}, Mappings{} {}
_pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false,
bool ImportedHostPtr = false)
: Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost},
HostPtrImported{ImportedHostPtr}, Mappings{} {}

private:
// The key is the host pointer representing an active mapping.
Expand All @@ -856,9 +861,9 @@ struct _pi_buffer final : _pi_mem {
// Buffer/Sub-buffer constructor
_pi_buffer(pi_context Ctx, char *Mem, char *HostPtr,
_pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0,
bool MemOnHost = false)
: _pi_mem(Ctx, HostPtr, MemOnHost), ZeMem{Mem}, SubBuffer{Parent, Origin,
Size} {}
bool MemOnHost = false, bool ImportedHostPtr = false)
rdeodhar marked this conversation as resolved.
Show resolved Hide resolved
: _pi_mem(Ctx, HostPtr, MemOnHost, ImportedHostPtr), ZeMem{Mem},
SubBuffer{Parent, Origin, Size} {}

void *getZeHandle() override { return ZeMem; }

Expand Down