Skip to content

Commit 717967c

Browse files
authored
[SYCL] optimize enqueueImpKernel by inlining (#20681)
_[functional change]:_ **GlobalHandler instance move** from a static variable declared inside a method to global scope. This **changes when the first object is created**. Previously it would only be created if getInstancePtr was called, now it happens at the time of loading the library. Users pay for the instance even if they never use the global handle. Motivation is performance - accessing global variable declared inside a function can be slower than to variable in global scope. Explained e.g. [here](https://stackoverflow.com/questions/52198322/is-access-to-a-static-function-variable-slower-than-access-to-a-global-variable/52202005#52202005) _[non-functional-change]_: **various functions** exising on hot submit-kernel path **get inlined** This PR is part of #20668 which was split into two. See plots in that PR desciption for performance comparisons.
1 parent ed3809b commit 717967c

File tree

10 files changed

+98
-141
lines changed

10 files changed

+98
-141
lines changed

sycl/source/detail/context_impl.cpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -263,22 +263,6 @@ context_impl::get_backend_info<info::device::backend_version>() const {
263263
}
264264
#endif
265265

266-
ur_context_handle_t &context_impl::getHandleRef() { return MContext; }
267-
const ur_context_handle_t &context_impl::getHandleRef() const {
268-
return MContext;
269-
}
270-
271-
KernelProgramCache &context_impl::getKernelProgramCache() const {
272-
return MKernelProgramCache;
273-
}
274-
275-
bool context_impl::hasDevice(const detail::device_impl &Device) const {
276-
for (device_impl *D : MDevices)
277-
if (D == &Device)
278-
return true;
279-
return false;
280-
}
281-
282266
device_impl *
283267
context_impl::findMatchingDeviceImpl(ur_device_handle_t &DeviceUR) const {
284268
for (device_impl *D : MDevices)

sycl/source/detail/context_impl.hpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ class context_impl : public std::enable_shared_from_this<context_impl> {
116116
/// reference will be invalid if context_impl was destroyed.
117117
///
118118
/// \return an instance of raw UR context handle.
119-
ur_context_handle_t &getHandleRef();
119+
ur_context_handle_t &getHandleRef() { return MContext; }
120120

121121
/// Gets the underlying context object (if any) without reference count
122122
/// modification.
@@ -126,7 +126,7 @@ class context_impl : public std::enable_shared_from_this<context_impl> {
126126
/// reference will be invalid if context_impl was destroyed.
127127
///
128128
/// \return an instance of raw UR context handle.
129-
const ur_context_handle_t &getHandleRef() const;
129+
const ur_context_handle_t &getHandleRef() const { return MContext; }
130130

131131
devices_range getDevices() const { return MDevices; }
132132

@@ -151,10 +151,15 @@ class context_impl : public std::enable_shared_from_this<context_impl> {
151151
return {MCachedLibPrograms, MCachedLibProgramsMutex};
152152
}
153153

154-
KernelProgramCache &getKernelProgramCache() const;
154+
KernelProgramCache &getKernelProgramCache() const {
155+
return MKernelProgramCache;
156+
}
155157

156158
/// Returns true if and only if context contains the given device.
157-
bool hasDevice(const detail::device_impl &Device) const;
159+
bool hasDevice(const detail::device_impl &Device) const {
160+
return std::any_of(MDevices.begin(), MDevices.end(),
161+
[&](auto *D) { return D == &Device; });
162+
}
158163

159164
/// Returns true if and only if the device can be used within this context.
160165
/// For OpenCL this is currently equivalent to hasDevice, for other backends

sycl/source/detail/device_kernel_info.cpp

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -73,24 +73,6 @@ void DeviceKernelInfo::setCompileTimeInfoIfNeeded(
7373
assert(Info == *this);
7474
}
7575

76-
FastKernelSubcacheT &DeviceKernelInfo::getKernelSubcache() {
77-
assertInitialized();
78-
return MFastKernelSubcache;
79-
}
80-
81-
const std::optional<int> &DeviceKernelInfo::getImplicitLocalArgPos() {
82-
assertInitialized();
83-
return MImplicitLocalArgPos;
84-
}
85-
86-
bool DeviceKernelInfo::isCompileTimeInfoSet() const { return KernelSize != 0; }
87-
88-
void DeviceKernelInfo::assertInitialized() {
89-
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
90-
assert(MInitialized.load() && "Data needs to be initialized before use");
91-
#endif
92-
}
93-
9476
} // namespace detail
9577
} // namespace _V1
9678
} // namespace sycl

sycl/source/detail/device_kernel_info.hpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,23 @@ class DeviceKernelInfo : public CompileTimeKernelInfoTy {
107107
#endif
108108
void setCompileTimeInfoIfNeeded(const CompileTimeKernelInfoTy &Info);
109109

110-
FastKernelSubcacheT &getKernelSubcache();
111-
const std::optional<int> &getImplicitLocalArgPos();
110+
FastKernelSubcacheT &getKernelSubcache() {
111+
assertInitialized();
112+
return MFastKernelSubcache;
113+
}
114+
115+
std::optional<int> getImplicitLocalArgPos() const {
116+
assertInitialized();
117+
return MImplicitLocalArgPos;
118+
}
112119

113120
private:
114-
void assertInitialized();
115-
bool isCompileTimeInfoSet() const;
121+
void assertInitialized() const {
122+
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
123+
assert(MInitialized.load() && "Data needs to be initialized before use");
124+
#endif
125+
}
126+
bool isCompileTimeInfoSet() const { return KernelSize != 0; }
116127

117128
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
118129
std::atomic<bool> MInitialized = false;

sycl/source/detail/global_handler.cpp

Lines changed: 21 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -105,20 +105,7 @@ void GlobalHandler::TraceEventXPTI(const char *Message) {
105105
#endif
106106
}
107107

108-
GlobalHandler *&GlobalHandler::getInstancePtr() {
109-
static GlobalHandler *RTGlobalObjHandler = new GlobalHandler();
110-
return RTGlobalObjHandler;
111-
}
112-
113-
GlobalHandler &GlobalHandler::instance() {
114-
GlobalHandler *RTGlobalObjHandler = GlobalHandler::getInstancePtr();
115-
assert(RTGlobalObjHandler && "Handler must not be deallocated earlier");
116-
return *RTGlobalObjHandler;
117-
}
118-
119-
bool GlobalHandler::isInstanceAlive() {
120-
return GlobalHandler::getInstancePtr();
121-
}
108+
GlobalHandler *GlobalHandler::RTGlobalObjHandler = new GlobalHandler();
122109

123110
template <typename T, typename... Types>
124111
T &GlobalHandler::getOrCreate(InstWithLock<T> &IWL, Types &&...Args) {
@@ -331,8 +318,7 @@ void GlobalHandler::drainThreadPool() {
331318
// 2) when process is being terminated
332319
void shutdown_early(bool CanJoinThreads = true) {
333320
const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector};
334-
GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
335-
if (!Handler)
321+
if (!GlobalHandler::RTGlobalObjHandler)
336322
return;
337323

338324
#if defined(XPTI_ENABLE_INSTRUMENTATION) && defined(_WIN32)
@@ -342,26 +328,26 @@ void shutdown_early(bool CanJoinThreads = true) {
342328
#endif
343329

344330
// Now that we are shutting down, we will no longer defer MemObj releases.
345-
Handler->endDeferredRelease();
331+
GlobalHandler::RTGlobalObjHandler->endDeferredRelease();
346332

347333
// Ensure neither host task is working so that no default context is accessed
348334
// upon its release
349-
Handler->prepareSchedulerToRelease(true);
335+
GlobalHandler::RTGlobalObjHandler->prepareSchedulerToRelease(true);
350336

351-
if (Handler->MHostTaskThreadPool.Inst) {
352-
Handler->MHostTaskThreadPool.Inst->finishAndWait(CanJoinThreads);
353-
Handler->MHostTaskThreadPool.Inst.reset(nullptr);
337+
if (GlobalHandler::RTGlobalObjHandler->MHostTaskThreadPool.Inst) {
338+
GlobalHandler::RTGlobalObjHandler->MHostTaskThreadPool.Inst->finishAndWait(
339+
CanJoinThreads);
340+
GlobalHandler::RTGlobalObjHandler->MHostTaskThreadPool.Inst.reset(nullptr);
354341
}
355342

356343
// This releases OUR reference to the default context, but
357344
// other may yet have refs
358-
Handler->releaseDefaultContexts();
345+
GlobalHandler::RTGlobalObjHandler->releaseDefaultContexts();
359346
}
360347

361348
void shutdown_late() {
362349
const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector};
363-
GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
364-
if (!Handler)
350+
if (!GlobalHandler::RTGlobalObjHandler)
365351
return;
366352

367353
#if defined(XPTI_ENABLE_INSTRUMENTATION) && defined(_WIN32)
@@ -371,26 +357,27 @@ void shutdown_late() {
371357
#endif
372358

373359
// First, release resources, that may access adapters.
374-
Handler->MPlatformCache.Inst.reset(nullptr);
375-
Handler->MScheduler.Inst.reset(nullptr);
376-
Handler->MProgramManager.Inst.reset(nullptr);
360+
GlobalHandler::RTGlobalObjHandler->MPlatformCache.Inst.reset(nullptr);
361+
GlobalHandler::RTGlobalObjHandler->MScheduler.Inst.reset(nullptr);
362+
GlobalHandler::RTGlobalObjHandler->MProgramManager.Inst.reset(nullptr);
377363

378364
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
379365
// Kernel cache, which is part of device kernel info,
380366
// stores handles to the adapter, so clear it before releasing adapters.
381-
Handler->MDeviceKernelInfoStorage.Inst.reset(nullptr);
367+
GlobalHandler::RTGlobalObjHandler->MDeviceKernelInfoStorage.Inst.reset(
368+
nullptr);
382369
#endif
383370

384371
// Clear the adapters and reset the instance if it was there.
385-
Handler->unloadAdapters();
386-
if (Handler->MAdapters.Inst)
387-
Handler->MAdapters.Inst.reset(nullptr);
372+
GlobalHandler::RTGlobalObjHandler->unloadAdapters();
373+
if (GlobalHandler::RTGlobalObjHandler->MAdapters.Inst)
374+
GlobalHandler::RTGlobalObjHandler->MAdapters.Inst.reset(nullptr);
388375

389-
Handler->MXPTIRegistry.Inst.reset(nullptr);
376+
GlobalHandler::RTGlobalObjHandler->MXPTIRegistry.Inst.reset(nullptr);
390377

391378
// Release the rest of global resources.
392-
delete Handler;
393-
Handler = nullptr;
379+
delete GlobalHandler::RTGlobalObjHandler;
380+
GlobalHandler::RTGlobalObjHandler = nullptr;
394381
}
395382

396383
#ifdef _WIN32

sycl/source/detail/global_handler.hpp

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,11 @@ class DeviceKernelInfo;
4848
/// construction or destruction is generated anyway.
4949
class GlobalHandler {
5050
public:
51-
/// \return a reference to a GlobalHandler singleton instance. Memory for
52-
/// storing objects is allocated on first call. The reference is valid as long
53-
/// as runtime library is loaded (i.e. untill `DllMain` or
51+
static bool isInstanceAlive() { return RTGlobalObjHandler != nullptr; }
52+
/// \return a reference to a GlobalHandler singleton instance. The reference
53+
/// is valid as long as runtime library is loaded (i.e. untill `DllMain` or
5454
/// `__attribute__((destructor))` is called).
55-
static GlobalHandler &instance();
56-
57-
/// \return true if the instance has not been deallocated yet.
58-
static bool isInstanceAlive();
55+
static GlobalHandler &instance() { return *RTGlobalObjHandler; }
5956

6057
GlobalHandler(const GlobalHandler &) = delete;
6158
GlobalHandler(GlobalHandler &&) = delete;
@@ -96,19 +93,18 @@ class GlobalHandler {
9693
void attachScheduler(Scheduler *Scheduler);
9794

9895
private:
96+
// Constructor and destructor are declared out-of-line to allow incomplete
97+
// types as template arguments to unique_ptr.
98+
GlobalHandler();
99+
~GlobalHandler();
100+
99101
bool OkToDefer = true;
100102

101103
friend void shutdown_early(bool);
102104
friend void shutdown_late();
103105
friend class ObjectUsageCounter;
104-
static GlobalHandler *&getInstancePtr();
105106
static SpinLock MSyclGlobalHandlerProtector;
106107

107-
// Constructor and destructor are declared out-of-line to allow incomplete
108-
// types as template arguments to unique_ptr.
109-
GlobalHandler();
110-
~GlobalHandler();
111-
112108
template <typename T> struct InstWithLock {
113109
std::unique_ptr<T> Inst;
114110
SpinLock Lock;
@@ -135,7 +131,10 @@ class GlobalHandler {
135131
#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
136132
InstWithLock<std::deque<DeviceKernelInfo>> MDeviceKernelInfoStorage;
137133
#endif
134+
135+
static GlobalHandler *RTGlobalObjHandler;
138136
};
137+
139138
} // namespace detail
140139
} // namespace _V1
141140
} // namespace sycl

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,6 @@ static void enableITTAnnotationsIfNeeded(const ur_program_handle_t &Prog,
6666
}
6767
}
6868

69-
ProgramManager &ProgramManager::getInstance() {
70-
return GlobalHandler::instance().getProgramManager();
71-
}
72-
7369
static Managed<ur_program_handle_t>
7470
createBinaryProgram(context_impl &Context, devices_range Devices,
7571
const uint8_t **Binaries, size_t *Lengths,
@@ -1805,14 +1801,6 @@ void ProgramManager::cacheKernelImplicitLocalArg(
18051801
}
18061802
}
18071803

1808-
std::optional<int>
1809-
ProgramManager::kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const {
1810-
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
1811-
if (it != m_KernelImplicitLocalArgPos.end())
1812-
return it->second;
1813-
return {};
1814-
}
1815-
18161804
DeviceKernelInfo &ProgramManager::getOrCreateDeviceKernelInfo(
18171805
const CompileTimeKernelInfoTy &Info) {
18181806
std::lock_guard<std::mutex> Guard(m_DeviceKernelInfoMapMutex);
@@ -2344,24 +2332,6 @@ ProgramManager::getBinImageState(const RTDeviceBinaryImage *BinImage) {
23442332
: sycl::bundle_state::object;
23452333
}
23462334

2347-
std::optional<kernel_id>
2348-
ProgramManager::tryGetSYCLKernelID(KernelNameStrRefT KernelName) {
2349-
std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
2350-
2351-
auto KernelID = m_KernelName2KernelIDs.find(KernelName);
2352-
if (KernelID == m_KernelName2KernelIDs.end())
2353-
return std::nullopt;
2354-
2355-
return KernelID->second;
2356-
}
2357-
2358-
kernel_id ProgramManager::getSYCLKernelID(KernelNameStrRefT KernelName) {
2359-
if (std::optional<kernel_id> MaybeKernelID = tryGetSYCLKernelID(KernelName))
2360-
return *MaybeKernelID;
2361-
throw exception(make_error_code(errc::runtime),
2362-
"No kernel found with the specified name");
2363-
}
2364-
23652335
bool ProgramManager::hasCompatibleImage(const device_impl &DeviceImpl) {
23662336
std::lock_guard<std::mutex> Guard(m_KernelIDsMutex);
23672337

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,9 @@ class ProgramManager {
134134
public:
135135
// Returns the single instance of the program manager for the entire
136136
// process. Can only be called after staticInit is done.
137-
static ProgramManager &getInstance();
137+
static ProgramManager &getInstance() {
138+
return GlobalHandler::instance().getProgramManager();
139+
}
138140

139141
const RTDeviceBinaryImage &getDeviceImage(KernelNameStrRefT KernelName,
140142
context_impl &ContextImpl,
@@ -236,11 +238,24 @@ class ProgramManager {
236238

237239
// The function returns the unique SYCL kernel identifier associated with a
238240
// kernel name or nullopt if there is no such ID.
239-
std::optional<kernel_id> tryGetSYCLKernelID(KernelNameStrRefT KernelName);
241+
std::optional<kernel_id> tryGetSYCLKernelID(KernelNameStrRefT KernelName) {
242+
std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
243+
244+
auto KernelID = m_KernelName2KernelIDs.find(KernelName);
245+
if (KernelID == m_KernelName2KernelIDs.end())
246+
return std::nullopt;
247+
248+
return KernelID->second;
249+
}
240250

241251
// The function returns the unique SYCL kernel identifier associated with a
242252
// kernel name or throws a sycl exception if there is no such ID.
243-
kernel_id getSYCLKernelID(KernelNameStrRefT KernelName);
253+
kernel_id getSYCLKernelID(KernelNameStrRefT KernelName) {
254+
if (std::optional<kernel_id> MaybeKernelID = tryGetSYCLKernelID(KernelName))
255+
return *MaybeKernelID;
256+
throw exception(make_error_code(errc::runtime),
257+
"No kernel found with the specified name");
258+
}
244259

245260
// The function returns a vector containing all unique SYCL kernel identifiers
246261
// in SYCL device images.
@@ -375,7 +390,12 @@ class ProgramManager {
375390
SanitizerType kernelUsesSanitizer() const { return m_SanitizerFoundInImage; }
376391

377392
std::optional<int>
378-
kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const;
393+
kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const {
394+
auto it = m_KernelImplicitLocalArgPos.find(KernelName);
395+
if (it != m_KernelImplicitLocalArgPos.end())
396+
return it->second;
397+
return {};
398+
}
379399

380400
DeviceKernelInfo &
381401
getOrCreateDeviceKernelInfo(const CompileTimeKernelInfoTy &Info);
@@ -562,6 +582,7 @@ class ProgramManager {
562582

563583
friend class ::ProgramManagerTest;
564584
};
585+
565586
} // namespace detail
566587
} // namespace _V1
567588
} // namespace sycl

0 commit comments

Comments
 (0)